In [2]:
from lavis.datasets.builders import load_dataset
from lavis.models import load_model_and_preprocess
from dotenv import load_dotenv
from PIL import Image
from algo import (
    PITOME,
    TOME,
    DIFFRATE,
    DCT,
    TOFU,
    LTMP,
    NONE, 
    pitome,
    tome,
    DiffRate,
    tofu,
    # ltmp
)

import os

# Root directory of the Flickr30k images. Set the FLICKR_PATH environment
# variable to point at a different location; the fallback is the original
# machine-specific path, so existing runs behave exactly as before.
FLICKR_PATH = os.environ.get(
    "FLICKR_PATH", "/media/caduser/MyBook/chau/.cache/flickr30k/images"
)

# Load the "blip_retrieval" / "coco" model together with the visual and text
# processors returned alongside it.
# NOTE(review): is_eval=False presumably leaves the model in training mode —
# confirm that is intended for this notebook.
model, vis_processors, txt_processors = load_model_and_preprocess(
    "blip_retrieval", "coco", is_eval=False
)

# Build the "flickr30k" dataset, reading images from FLICKR_PATH.
dataset = load_dataset("flickr30k", vis_path=FLICKR_PATH, cfg_path=None)


Using downloaded and verified file: /media/caduser/MyBook/chau/.cache/flickr30k/annotations/train.json
Using downloaded and verified file: /media/caduser/MyBook/chau/.cache/flickr30k/annotations/val.json
Using downloaded and verified file: /media/caduser/MyBook/chau/.cache/flickr30k/annotations/test.json


In [3]:
# Inspect the loaded dataset object; the recorded output shows a dict with
# 'train', 'val' and 'test' entries.
dataset

{'train': <lavis.datasets.datasets.retrieval_datasets.RetrievalDataset at 0x7f60348f6e10>,
 'val': <lavis.datasets.datasets.retrieval_datasets.RetrievalEvalDataset at 0x7f5f8246aa50>,
 'test': <lavis.datasets.datasets.retrieval_datasets.RetrievalEvalDataset at 0x7f5f50fddc10>}

In [16]:

from lavis.common.registry import registry
from lavis.processors.base_processor import BaseProcessor
from lavis.processors.randaugment import RandomAugment
from omegaconf import OmegaConf
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from lavis.processors.blip_processors import BlipImageBaseProcessor


# Make sure the transform is correct for your model!
class BlipImageEvalProcessor(BlipImageBaseProcessor):
    """Image preprocessing pipeline: square bicubic resize, tensor conversion, normalization.

    Args:
        image_size: Side length of the square output; the image is resized to
            ``(image_size, image_size)``.
        mean: Forwarded unchanged to ``BlipImageBaseProcessor.__init__``.
        std: Forwarded unchanged to ``BlipImageBaseProcessor.__init__``.
    """

    def __init__(self, image_size=384, mean=None, std=None):
        super().__init__(mean=mean, std=std)

        # `self.normalize` is expected to exist after the base initializer runs
        # (it is not assigned anywhere in this class).
        target_size = (image_size, image_size)
        pipeline_steps = [
            transforms.Resize(target_size, interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            self.normalize,
        ]
        self.transform = transforms.Compose(pipeline_steps)

    def __call__(self, item):
        """Run ``item`` through the composed transform and return the result."""
        return self.transform(item)

    @classmethod
    def from_config(cls, cfg=None):
        """Build a processor from a config object.

        An empty OmegaConf config is used when ``cfg`` is None. Missing keys
        fall back to ``image_size=384``, ``mean=None`` and ``std=None``.
        """
        cfg = OmegaConf.create() if cfg is None else cfg

        return cls(
            image_size=cfg.get("image_size", 384),
            mean=cfg.get("mean", None),
            std=cfg.get("std", None),
        )


# Build the eval processor with its defaults (image_size=384, mean=None, std=None).
blip_processor = BlipImageEvalProcessor()
# Take the 'image' field of the first training sample.
img = dataset['train'][0]['image']
# NOTE(review): the recorded output for this cell shows `img` is already a
# normalized tensor, while torchvision's ToTensor is documented for PIL Image /
# ndarray input — confirm this call succeeds on a fresh kernel.
img_input = blip_processor(img)
# Displays the dataset's own `img`, not the re-processed `img_input`.
img 

tensor([[[-1.7777, -1.7631, -1.7485,  ...,  1.4924,  1.4048,  1.4340],
         [-1.7339, -1.7339, -1.7339,  ...,  1.7114,  1.4486,  1.3464],
         [-1.7631, -1.7485, -1.7047,  ...,  0.6457,  0.8355,  1.4924],
         ...,
         [-0.7558,  0.1493,  0.3391,  ..., -0.1718,  0.5727, -0.0113],
         [ 1.0544,  0.6749,  0.2223,  ...,  0.1785,  0.2807,  0.1639],
         [ 0.5289,  0.7041,  0.7625,  ...,  0.6749,  0.7625, -0.0696]],

        [[-1.6470, -1.6621, -1.6771,  ...,  2.0149,  2.0149,  1.9998],
         [-1.6621, -1.6771, -1.6621,  ...,  2.0749,  2.0149,  1.9848],
         [-1.7221, -1.7071, -1.6621,  ...,  1.4896,  1.6847,  2.0449],
         ...,
         [-0.3564,  0.6041,  0.7992,  ...,  0.4841,  1.0844,  0.6942],
         [ 1.3695,  1.0544,  0.6491,  ...,  0.8593,  0.9343,  0.9943],
         [ 0.8292,  1.2044,  1.2945,  ...,  1.1444,  1.0994,  0.3790]],

        [[-1.4091, -1.4091, -1.4233,  ...,  2.1317,  2.1459,  2.0890],
         [-1.4091, -1.4233, -1.4091,  ...,  2

{'train': <lavis.processors.blip_processors.BlipImageTrainProcessor at 0x7f60df09afd0>,
 'eval': <lavis.processors.blip_processors.BlipImageEvalProcessor at 0x7f60f046fd50>}