In [15]:
# Colab-only step: attach the user's Google Drive to the runtime filesystem at
# /content/drive so the dataset folder used below is reachable as ordinary files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Installation

In [16]:
!pip install -qU pillow tiktoken

# Data Loading

In [17]:
import os
import json

# Dataset root on the mounted Drive, and the sub-folder holding the images
# (and their caption .txt files) processed by this notebook.
FLUX_IMG_DIR = os.path.join("/content/drive/MyDrive", 'FLUX DATASET')
TEST_DATASET_THREE_PING = os.path.join(FLUX_IMG_DIR, 'TEST_DATASET_THREE_PING_FLORENCE')
print(TEST_DATASET_THREE_PING)
# exist_ok=True makes the cell safe to re-run and removes the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(TEST_DATASET_THREE_PING, exist_ok=True)

/content/drive/MyDrive/FLUX DATASET/TEST_DATASET_THREE_PING_FLORENCE


# Supporting Methods

In [18]:
import tiktoken

def count_tokens(text):
  """Return how many tokens `text` encodes to under tiktoken's "cl100k_base" encoding."""
  tokenizer = tiktoken.get_encoding("cl100k_base")
  return len(tokenizer.encode(text))

In [19]:
from transformers import AutoProcessor, AutoModelForCausalLM

def model_loading(torch_dtype, device, model_id="microsoft/Florence-2-large"):
  """Load a causal-LM checkpoint from the Hugging Face Hub and move it to `device`.

  Args:
    torch_dtype: dtype forwarded to `from_pretrained` for the model weights.
    device: target passed to `.to(...)` (e.g. "cuda:0" or "cpu").
    model_id: Hub repository id of the checkpoint. Defaults to
      "microsoft/Florence-2-large", so existing callers are unaffected.

  Returns:
    The loaded model, already moved to `device`.
  """
  # trust_remote_code=True lets the custom modelling code shipped in the model
  # repository run locally; only point model_id at repositories you trust.
  model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
  )
  return model.to(device)

def processor_loading(model_id="microsoft/Florence-2-large"):
  """Load the processor that belongs to a Hugging Face Hub checkpoint.

  Args:
    model_id: Hub repository id of the checkpoint. Defaults to
      "microsoft/Florence-2-large", so existing callers are unaffected.

  Returns:
    The object returned by `AutoProcessor.from_pretrained` for `model_id`.
  """
  # trust_remote_code=True lets the custom processing code shipped in the model
  # repository run locally; only point model_id at repositories you trust.
  return AutoProcessor.from_pretrained(model_id, trust_remote_code=True)


# Data Preprocessing
- Get list of images and txt
- Iterate
  - if an image has a corresponding txt file: Good!
    - "TEST_IMG_01.jpg" : "TEST_IMG_01.txt"
  - if an image doesn't have a corresponding txt file: Not Good!
    - Show the image
    - Get input
      - the input should be at least 20 words long
      - if it's too short, ask for it again
    - Save the corresponding txt file.

In [20]:
# Read the dataset folder once. os.listdir() returns entries in arbitrary
# order, so sort them to make the processing order reproducible between runs.
dataset_files = sorted(os.listdir(TEST_DATASET_THREE_PING))

# list of images
image_list = [name for name in dataset_files if name.endswith(".jpg")]
# list of correlated text files (file name up to the first dot)
text_list = [name.split(".")[0] for name in dataset_files if name.endswith(".txt")]

print(f"Number of images: {len(image_list)}")
print(f"Number of text files: {len(text_list)}")

# Set lookup keeps the scan linear in the number of files.
# NOTE(review): stems are cut at the FIRST dot, so names containing extra dots
# ("a.b.jpg", "a.c.jpg") are matched by their shared prefix "a".
text_stems = set(text_list)
image_needs_update = [image for image in image_list if image.split(".")[0] not in text_stems]

print(f"Number of images needs update: {len(image_needs_update)}")

Number of images: 10
Number of text files: 10
Number of images needs update: 0


## Model and Processor Loading

In [21]:
import torch
from PIL import Image

# Pick the execution target: first CUDA device with half precision when a GPU
# is available, otherwise CPU with full precision.
if torch.cuda.is_available():
  device, torch_dtype = "cuda:0", torch.float16
else:
  device, torch_dtype = "cpu", torch.float32

# Load the model onto the chosen device, then the matching processor.
model = model_loading(torch_dtype=torch_dtype, device=device)
processor = processor_loading()

In [22]:
# Task token sent to the model; it is also the key under which the processor
# returns the parsed caption.
prompt = "<MORE_DETAILED_CAPTION>"

for image_file in image_needs_update:
  # image setting
  # Open inside a context manager so the file handle is released, and convert
  # to RGB so grayscale / CMYK / palette files reach the processor as a
  # uniform 3-channel image. convert() returns a loaded copy that stays valid
  # after the file is closed.
  with Image.open(os.path.join(TEST_DATASET_THREE_PING, image_file)) as raw_image:
    image = raw_image.convert("RGB")
  inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

  # getting a description about the image by Florence-2
  # NOTE(review): do_sample=True with no seed means captions differ between
  # runs; switch to do_sample=False (or seed torch) if reproducibility matters.
  generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=1024,
    num_beams=3,
    do_sample=True
  )
  # Special tokens are kept because post_process_generation parses the raw text.
  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
  parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))
  generated_description = "[trigger] " + parsed_answer[prompt]

  # result
  display(image)
  print(generated_description)

  # saving the description about the image
  # splitext() strips only the final extension, so "a.b.jpg" is saved as
  # "a.b.txt" instead of colliding with other "a.*" images on "a.txt".
  text_file = os.path.splitext(image_file)[0] + ".txt"
  text_file_path = os.path.join(TEST_DATASET_THREE_PING, text_file)
  # Explicit UTF-8 so the caption is written identically on every platform.
  with open(text_file_path, "w", encoding="utf-8") as file:
    file.write(generated_description)

