In [None]:
!pip install -q datasets

In [None]:
import yaml
import os
import json
import sys

from datasets import load_dataset
from  tqdm import tqdm


# Model to run, as (short name used for output paths, checkpoint id passed to from_pretrained).
# Swap which line is commented out to switch checkpoints.
# model_tuple = ('vit_tucano_1b', 'TucanoBR/ViTucano-1b5-v1')
model_tuple = ('vit_tucano_2b', 'TucanoBR/ViTucano-2b8-v1')

# Local directory that holds the image file handed to the model during generation.
images_dir = 'images/'
os.makedirs(images_dir, exist_ok=True)

# Captions are written to a per-model subdirectory of the shared captions folder.
outputs_dir = os.path.join('drive/MyDrive/Experimentos/gabriel/captions/', model_tuple[0])
os.makedirs(outputs_dir, exist_ok=True)

# Interpolated into the prompt as a word limit and also passed as max_new_tokens.
max_length = 25

# The accented characters must be real UTF-8 text: the previous literal was
# mojibake (UTF-8 bytes decoded as Latin-1), which sent a garbled prompt to the model.
prompt = f'Escreva uma descrição em português do Brasil para a imagem com no máximo {max_length} palavras.'

In [None]:
# Display the resolved per-model output directory so the destination can be checked.
outputs_dir

In [None]:
# Dataset identifier handed to `load_dataset`.
dataset_hub = 'laicsiifes/flickr30k-pt-br-human-generated'
print(f"\nLoading {dataset_hub}")

# Only the 'test' split is requested; its size is reported right after loading.
test_dataset = load_dataset(path=dataset_hub, split='test')
print(f"\n\tTotal of Examples: {len(test_dataset)}")

In [None]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'\nDevice: {device}')

# The tokenizer and the model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_tuple[1])

# trust_remote_code=True allows the model code shipped with the checkpoint to run.
model = AutoModelForCausalLM.from_pretrained(
    model_tuple[1],
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # for optimized inference
)

# Kept as the cell's final expression, so the notebook displays the returned module.
model.to(device)

In [None]:
def _load_processed_captions(path):
    """Return the caption records already saved at ``path``, keyed by ``img_id``.

    Returns an empty dict when ``path`` does not exist (nothing to resume from).
    """
    if not os.path.exists(path):
        return {}
    with open(file=path, mode='r', encoding='utf-8') as json_file:
        return {record['img_id']: record for record in json.load(json_file)}


def _save_captions(captions, path):
    """Write ``captions`` to ``path`` as JSON via a temporary file.

    The data is dumped to ``<path>.tmp`` first and then moved over ``path`` with
    ``os.replace``. If the run is interrupted while dumping, the previous
    checkpoint is still intact and the next run can load it and resume.
    """
    partial_path = f'{path}.tmp'
    with open(file=partial_path, mode='w', encoding='utf-8') as json_file:
        json.dump(captions, json_file, indent=4)
    os.replace(partial_path, path)


print(f'\nGenerating Captions Using {model_tuple[0]}\n')

outputs_file_path = os.path.join(outputs_dir, f'{model_tuple[0]}.json')

# Records from a previous (possibly interrupted) run; these images are not regenerated.
dict_images_processed = _load_processed_captions(outputs_file_path)

list_generated_captions = []

# model.chat is given the image as a file path, so every image is written to this
# single file, which is overwritten on each iteration.
temp_image_path = os.path.join(images_dir, 'image.jpeg')

# Iterating through tqdm advances the bar once per example, including skipped ones.
for example in tqdm(test_dataset, total=len(test_dataset), colour='green',
                    file=sys.stdout, desc='Generating Captions'):

    img_id = example['img_id']

    # Resume support: reuse the stored record instead of calling the model again.
    if img_id in dict_images_processed:
        list_generated_captions.append(dict_images_processed[img_id])
        continue

    example['image'].save(temp_image_path)

    # NOTE(review): max_length is a *word* limit in the prompt but a *token* limit
    # here, so captions may be cut short of the requested length - confirm intended.
    # The second value returned by model.chat is not used.
    generated_caption, _ = model.chat(
        prompt=prompt,
        image=temp_image_path,
        tokenizer=tokenizer,
        max_new_tokens=max_length,
        temperature=0.1
    )

    list_generated_captions.append(
        {
            'img_id': img_id,
            'file_name': example['filename'],
            'reference_captions': example['caption'],
            'generated_caption': generated_caption
        }
    )

    # Checkpoint after every new caption so an interrupted run loses at most one image.
    _save_captions(list_generated_captions, outputs_file_path)