In [1]:
import sys 
sys.path.insert(0, '/home/ray/default')

import os
from util.utils import (
    generate_output_path,
    prompt_for_hugging_face_token
)
import ray
import os
from typing import Dict
import numpy as np
from vllm import LLM, SamplingParams

Failed to import from vllm._C with ImportError('libcuda.so.1: cannot open shared object file: No such file or directory')


In [21]:
# Source dataset on the Hugging Face Hub:
# https://huggingface.co/datasets/DBQ/Burberry.Product.prices.United.States?row=0
HF_DATA = "DBQ/Burberry.Product.prices.United.States"

# Root storage location; the image, data and caption paths below all hang off it.
BASE_PATH = 's3://anyscale-customer-dataplane-data-production-us-east-2/artifact_storage/org_6687q89lgh27q3z41zesm2fsq6/cld_j25ipm5kli358v41pn9c96gjg3/BurberryData:john_:kpbdm'
IMG_PATH = f"{BASE_PATH}/images"
DATA_PATH = f"{BASE_PATH}/data"
CAPTION_PATH = f"{BASE_PATH}/captions/2"

# Local image directory (presumably for test runs; not read by the visible cells).
IMG_PATH_TEST = "/home/ray/default/data/images"

In [22]:
# Hugging Face model id of the PaliGemma checkpoint.
# NOTE(review): no visible cell reads this constant — confirm whether it is still needed.
HF_MODEL = "google/paligemma-3b-mix-224"

#### Run Config
There are two modes: `test` and `prod`. The `test` mode operates only on a small subset of the data.

In [23]:
from enum import Enum, IntEnum
from pydantic import BaseModel, ValidationError

class RunMode(str, Enum):
    """Run modes for this notebook.

    Members also inherit from ``str``, so each one compares equal to its
    plain string value.
    """
    # Selects the reduced image limit used for quick checks.
    test = 'test'
    # Selects the larger image limit used for the real run.
    prod = 'prod'

In [24]:
# Active run mode. The image-reading cell caps the dataset at 10 rows for
# RunMode.test and at 1000 rows for any other mode.
mode = RunMode.prod

In [8]:
import numpy as np

# Sanity check: iterating over the leading axis of a (32, 69, 69) array
# yields (69, 69) slices. Only the first slice is inspected.
probe = np.zeros((32, 69, 69))
for i, plane in enumerate(probe):
    print(plane.shape)
    break

(69, 69)


## Read Images

In [25]:
# Row cap for the run: a handful of images in test mode, 1000 otherwise.
if mode == RunMode.test:
    LIMIT = 10
else:
    LIMIT = 1000

# Read the images (keeping each file's path as a column) and truncate to LIMIT rows.
img_data = ray.data.read_images(
    IMG_PATH,
    include_paths=True,
    override_num_blocks=20,
).limit(LIMIT)

KeyboardInterrupt: 

## Inference with PaliGemma

In [None]:
class PaliGemmaPredictor:
    """Callable batch captioner backed by a PaliGemma checkpoint.

    The model and processor are loaded once in ``__init__`` and reused for
    every batch passed to ``__call__``, so the class can be used as a
    stateful (class-based) UDF with ``Dataset.map_batches``.

    Args:
        prompt: Text prompt paired with every image in a batch.
        image_col: Name of the batch column that holds the images.
        model_id: Hugging Face model id to load. The default is the
            checkpoint this class previously hard-coded.
        max_new_tokens: Upper bound on the number of tokens generated per
            image. The default matches the previously hard-coded value.
    """

    def __init__(
        self,
        prompt="caption en",
        image_col="image",
        model_id="google/paligemma-3b-mix-224",
        max_new_tokens=100,
    ):
        import torch
        from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        self.prompt = prompt
        self.image_col = image_col
        self.model_id = model_id
        self.max_new_tokens = max_new_tokens
        # from_pretrained() without a device argument leaves the weights on the
        # CPU, so move the model to the GPU explicitly when one is visible.
        # Falling back to the CPU keeps the class usable on GPU-less nodes.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = (
            PaliGemmaForConditionalGeneration.from_pretrained(self.model_id)
            .to(self.device)
            .eval()
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id)

    def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
        """Caption every image in ``batch``.

        Args:
            batch: Mapping of column name to array. It must contain the
                column named by ``image_col`` and a ``"path"`` column.

        Returns:
            A dict with ``"captions"`` (one decoded string per image) and
            ``"path"`` (the input paths as a Python list), in input order.
        """
        import torch

        images = list(batch[self.image_col])
        # The same prompt is paired with every image in the batch.
        prompts = [self.prompt] * len(images)
        model_inputs = self.processor(
            text=prompts, images=images, return_tensors="pt"
        ).to(self.device)  # inputs must live on the same device as the model
        input_len = model_inputs["input_ids"].shape[-1]

        with torch.inference_mode():
            generation = self.model.generate(
                **model_inputs,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,
            )

        # Keep only the tokens after the prompt. Plain slicing also copes with
        # the "zero new tokens" case, which a mask + reshape cannot.
        new_tokens = generation[:, input_len:].cpu()
        decoded = self.processor.batch_decode(new_tokens, skip_special_tokens=True)

        return {
            "captions": decoded,
            "path": batch['path'].tolist()
        }


In [None]:

## Test
# batch = img_data.take_batch(10)
# PaliGemmaPredictor()(batch)

2024-09-22 16:08:02,065	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-22_15-58-40_116452_2325/logs/ray-data
2024-09-22 16:08:02,066	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadImage] -> LimitOperator[limit=1000] -> LimitOperator[limit=10]


- ReadImage 1: 0 bundle [00:00, ? bundle/s]

- limit=1000 2: 0 bundle [00:00, ? bundle/s]

- limit=10 3: 0 bundle [00:00, ? bundle/s]

Running 0: 0 bundle [00:00, ? bundle/s]

In [None]:
# Caption the images with PaliGemmaPredictor in batches of 100 rows.
# Resource settings: concurrency of 4, one GPU per worker, A10G accelerators.
ds = img_data.map_batches(
    PaliGemmaPredictor,
    fn_constructor_kwargs=dict(image_col="image"),
    batch_size=100,
    concurrency=4,
    num_gpus=1,
    accelerator_type="A10G",
)

In [None]:
# Persist the captions as Parquet under CAPTION_PATH without trying to create
# the destination directory first.
ds.write_parquet(path=CAPTION_PATH, try_create_dir=False)

2024-09-22 16:10:09,394	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-22_15-58-40_116452_2325/logs/ray-data
2024-09-22 16:10:09,395	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadImage] -> LimitOperator[limit=1000] -> ActorPoolMapOperator[MapBatches(PaliGemmaPredictor)] -> TaskPoolMapOperator[Write]


- ReadImage 1: 0 bundle [00:00, ? bundle/s]

- limit=1000 2: 0 bundle [00:00, ? bundle/s]

- MapBatches(PaliGemmaPredictor) 3: 0 bundle [00:00, ? bundle/s]

- Write 4: 0 bundle [00:00, ? bundle/s]

Running 0: 0 bundle [00:00, ? bundle/s]



KeyboardInterrupt: 

In [None]:
# Spot-check the written output by printing one row.
captions_ds = ray.data.read_parquet(CAPTION_PATH)
captions_ds.show(limit=1)

Parquet Files Sample 0:   0%|          | 0/2 [00:00<?, ? file/s]

2024-09-22 07:00:58,669	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-21_20-17-17_494561_3381/logs/ray-data
2024-09-22 07:00:58,670	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]


- ReadParquet->SplitBlocks(8) 1: 0 bundle [00:00, ? bundle/s]

- limit=1 2: 0 bundle [00:00, ? bundle/s]

Running 0: 0 bundle [00:00, ? bundle/s]

{'captions': 'a blue plaid scarf with a white tag on it.', 'path': 'anyscale-customer-dataplane-data-production-us-east-2/artifact_storage/org_6687q89lgh27q3z41zesm2fsq6/cld_j25ipm5kli358v41pn9c96gjg3/BurberryData:john_:kpbdm/images/0003C5D9-CD9D-4853-8A4C-86B331349517.png'}
