In [None]:
import os
from time import time

import pandas as pd
import torch
import wandb
from evaluate import load
from PIL import Image
from torch.utils.data import Dataset
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import AutoProcessor, AutoModelForCausalLM

In [None]:
# Custom dataset class for the synthetic captioning dataset
class CustomDataset(Dataset):
    """Dataset pairing every image file in a directory with its caption.

    Args:
        data_dir: Directory whose regular files are treated as images.
        data: Caption table handed to ``loader`` together with the image path.
        transform: Optional callable applied to the loaded image.
    """

    def __init__(self, data_dir, data, transform=None):
        self.data_dir = data_dir
        self.data = data
        self.transform = transform
        # os.listdir returns entries in arbitrary order and also lists
        # sub-directories (e.g. Jupyter's ".ipynb_checkpoints"). Sort the names
        # so the index -> image mapping is reproducible, and keep regular files
        # only so __getitem__ never tries to open a directory as an image.
        candidate_paths = (
            os.path.join(data_dir, name) for name in sorted(os.listdir(data_dir))
        )
        self.images = [path for path in candidate_paths if os.path.isfile(path)]

    def __len__(self):
        """Return the number of image files found in ``data_dir``."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, caption)`` for the image at position ``idx``.

        The image is passed through ``transform`` when one was provided.
        """
        image_path = self.images[idx]
        image, caption = loader(image_path, self.data)
        if self.transform:
            image = self.transform(image)
        return image, caption


def loader(path, data):
    """Load one image and look up its caption by file name.

    Args:
        path: Path to the image file.
        data: Table indexed as ``data["image"]`` (file names) and
            ``data["caption"]`` (caption text).

    Returns:
        Tuple ``(image, caption)``. The image is fully read into memory and its
        underlying file handle is already closed.

    Raises:
        IndexError: If no row of ``data`` matches the image's file name.
    """
    file_name = os.path.basename(path)
    # Look the caption up first so a missing row fails fast with a clear
    # message instead of an opaque IndexError from ``.values[0]``.
    matches = data.loc[data["image"] == file_name, "caption"].values
    if len(matches) == 0:
        raise IndexError(f"no caption found for image {file_name!r}")

    # Image.open is lazy and keeps the file open until the pixels are read;
    # load inside a context manager so the handle is released immediately
    # rather than leaking one descriptor per dataset item.
    with Image.open(path) as image:
        image.load()
    return image, matches[0]

In [None]:
# Root folder of the synthetic captions dataset
data_dir = "datasets/captions_sin"

# Caption table read from the CSV file stored in the dataset root
data = pd.read_csv(data_dir + "/captions.csv")

# Dataset over the files of the "images" sub-folder, paired with the table above
ds_sin = CustomDataset(data_dir=data_dir + "/images", data=data)

In [None]:
# Compute the METEOR metric
def metric_meteor(predicted_captions, reference_captions):
    """Score predictions against references with the "meteor" metric.

    Args:
        predicted_captions: Generated captions.
        reference_captions: Ground-truth captions, aligned with the predictions.

    Returns:
        The result of the metric's ``compute`` call, unchanged.
    """
    scorer = load("meteor")
    return scorer.compute(
        references=reference_captions, predictions=predicted_captions
    )

In [None]:
# Compute the ROUGE metric
def metric_rouge(predicted_captions, reference_captions):
    """Score predictions against references with the "rouge" metric.

    Args:
        predicted_captions: Generated captions.
        reference_captions: Ground-truth captions, aligned with the predictions.

    Returns:
        The result of the metric's ``compute`` call, unchanged.
    """
    scorer = load("rouge")
    return scorer.compute(
        references=reference_captions, predictions=predicted_captions
    )

In [None]:
# Compute the WER metric
def metric_wer(predicted_captions, reference_captions):
    """Score predictions against references with the "wer" metric.

    Args:
        predicted_captions: Generated captions.
        reference_captions: Ground-truth captions, aligned with the predictions.

    Returns:
        The result of the metric's ``compute`` call, unchanged.
    """
    scorer = load("wer")
    return scorer.compute(
        references=reference_captions, predictions=predicted_captions
    )

In [None]:
# Small image-captioning model (the reported figures were measured on a V100 GPU)
model_name = "microsoft/git-base"

# Run inference on the GPU when one is available; without explicit placement
# the model stays on the CPU and the timing below would not be a GPU figure.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The context manager finishes the W&B run even if the evaluation raises,
# so a failed cell does not leave a dangling run behind.
with wandb.init(project="child_diary", group=model_name, job_type="base") as run:
    # Load the image-captioning model
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    predicted_captions = []
    reference_captions = []

    start_time = time()

    # Run the model over the whole dataset
    for image, caption in ds_sin:
        pixel_values = processor(images=image, return_tensors="pt").pixel_values
        generated_ids = model.generate(
            pixel_values=pixel_values.to(device), max_length=50
        )
        pred_caption = processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]

        predicted_captions.append(pred_caption)
        reference_captions.append(caption)

    end_time = time()

    rouge_result = metric_rouge(predicted_captions, reference_captions)

    run.log(
        {
            # Log the scalar score, mirroring how the ROUGE values are
            # extracted, instead of the whole METEOR result dict.
            "METEOR": metric_meteor(predicted_captions, reference_captions)["meteor"],
            "ROUGE-1": rouge_result["rouge1"],
            "ROUGE-2": rouge_result["rouge2"],
            "ROUGE-L": rouge_result["rougeL"],
            "WER": metric_wer(predicted_captions, reference_captions),
            # Average wall-clock seconds per image (data loading included)
            "Speed 1 image": (end_time - start_time) / len(ds_sin),
        }
    )

In [None]:
# Medium image-captioning model (the reported figures were measured on a V100 GPU)
model_name = "Salesforce/blip-image-captioning-large"

# Run inference on the GPU when one is available; without explicit placement
# the model stays on the CPU and the timing below would not be a GPU figure.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The context manager finishes the W&B run even if the evaluation raises,
# so a failed cell does not leave a dangling run behind.
with wandb.init(project="child_diary", group=model_name, job_type="base") as run:
    # Load the image-captioning model
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)

    predicted_captions = []
    reference_captions = []

    start_time = time()

    # Run the model over the whole dataset
    for image, caption in ds_sin:
        inputs = processor(image, return_tensors="pt").to(device)
        out = model.generate(**inputs)
        predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
        reference_captions.append(caption)

    end_time = time()

    rouge_result = metric_rouge(predicted_captions, reference_captions)

    run.log(
        {
            # Log the scalar score, mirroring how the ROUGE values are
            # extracted, instead of the whole METEOR result dict.
            "METEOR": metric_meteor(predicted_captions, reference_captions)["meteor"],
            "ROUGE-1": rouge_result["rouge1"],
            "ROUGE-2": rouge_result["rouge2"],
            "ROUGE-L": rouge_result["rougeL"],
            "WER": metric_wer(predicted_captions, reference_captions),
            # Average wall-clock seconds per image (data loading included)
            "Speed 1 image": (end_time - start_time) / len(ds_sin),
        }
    )

In [None]:
# Large image-captioning model (the reported figures were measured on a V100 GPU)
model_name = "abhijit2111/Pic2Story"

# Run inference on the GPU when one is available; without explicit placement
# the model stays on the CPU and the timing below would not be a GPU figure.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The context manager finishes the W&B run even if the evaluation raises,
# so a failed cell does not leave a dangling run behind.
with wandb.init(project="child_diary", group=model_name, job_type="base") as run:
    # Load the image-captioning model
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)

    predicted_captions = []
    reference_captions = []

    start_time = time()

    # Run the model over the whole dataset
    for image, caption in ds_sin:
        inputs = processor(image, return_tensors="pt").to(device)
        out = model.generate(**inputs)
        predicted_captions.append(processor.decode(out[0], skip_special_tokens=True))
        reference_captions.append(caption)

    end_time = time()

    rouge_result = metric_rouge(predicted_captions, reference_captions)

    run.log(
        {
            # Log the scalar score, mirroring how the ROUGE values are
            # extracted, instead of the whole METEOR result dict.
            "METEOR": metric_meteor(predicted_captions, reference_captions)["meteor"],
            "ROUGE-1": rouge_result["rouge1"],
            "ROUGE-2": rouge_result["rouge2"],
            "ROUGE-L": rouge_result["rougeL"],
            "WER": metric_wer(predicted_captions, reference_captions),
            # Average wall-clock seconds per image (data loading included)
            "Speed 1 image": (end_time - start_time) / len(ds_sin),
        }
    )