In [None]:
from IPython.display import clear_output

In [None]:
!pip install git+https://github.com/huggingface/transformers accelerate
!pip install qwen-vl-utils[decord]==0.0.8
clear_output()

In [None]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Single source of truth for the checkpoint, so the model and the processor
# can never be loaded from two different repositories by accident.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype="auto", device_map="auto"
)

# The processor is created with an explicit pixel budget per image instead of the
# checkpoint defaults (which `AutoProcessor.from_pretrained(MODEL_ID)` would use).
# NOTE(review): per the Qwen2.5-VL model card these bounds correspond to roughly
# 256-1280 visual tokens per image (one token per 28x28 pixel area) - confirm.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels
)

In [None]:
import pandas as pd
import time

In [None]:
def infer_batch(image_paths: list[str], df: pd.DataFrame) -> pd.DataFrame:
    """
    Run OCR on a batch of images with the already-loaded model and append the
    results to ``df``.

    Relies on the module-level ``model`` and ``processor`` objects and on
    ``process_vision_info`` imported from ``qwen_vl_utils``.

    Args:
        image_paths (list[str]): paths of the images to OCR.
        df (pd.DataFrame): results accumulated so far. It is not modified in place.

    Returns:
        pd.DataFrame: a new frame (with a fresh 0..n-1 index) containing the rows of
        ``df`` followed by one row per image, with the columns ``image_path`` and
        ``text_recognition``. Model outputs that are empty or whitespace-only are
        stored as the string "EMPTY". When ``image_paths`` is empty, the rows of
        ``df`` are returned unchanged and the model is not called.
    """
    # Nothing to process: avoid calling the processor and the model on an empty batch.
    if not image_paths:
        return df.reset_index(drop=True)

    # The instruction is the same for every image, so it is built once.
    # The continuation lines keep their leading spaces on purpose: they are part of
    # the prompt text sent to the model, and changing them would change the prompt.
    prompt = """Extract all text that is clearly visible and fully legible in the image.
                        Crucially, only transcribe text that is completely unobstructed, uncovered, and not partially hidden by any other objects or elements within the image.
                        Do not infer, guess, or hallucinate any text that is unclear, obscured, or not genuinely present.
                        Only output text that is definitively and entirely legible.
                        Present the extracted text line by line."""

    # One single-turn conversation (image + instruction) per image.
    all_messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        for img in image_paths
    ]

    # Prepare the batched inputs.
    texts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in all_messages
    ]
    image_inputs, video_inputs = process_vision_info(all_messages)

    # Batched generation with a decoder-only model needs left padding, so that every
    # prompt ends exactly where generation starts. With right padding, the shorter
    # prompts of a batch would be followed by pad tokens before the generated text.
    processor.tokenizer.padding_side = "left"

    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of assuming "cuda"

    # Batch inference with greedy decoding (do_sample=False).
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
        temperature=0.0,
        repetition_penalty=1.0,
    )
    # generate() returns prompt + completion; drop the (padded) prompt part of each row.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    batch_df = pd.DataFrame(
        {
            "image_path": list(image_paths),
            # Mark images for which the model produced no visible text.
            "text_recognition": [txt if txt.strip() else "EMPTY" for txt in output_texts],
        }
    )

    # Append this batch to the results accumulated so far.
    return pd.concat([df, batch_df], ignore_index=True)


In [None]:
import os
import time
import pandas as pd

# Directory that holds the keyframes of the video being processed.
KEYFRAME_DIR = '/kaggle/input/aic-small-2024/Keyframes_L22/keyframes/L22_V002'

# os.listdir returns entries in arbitrary order, so sort before slicing;
# otherwise the 22 selected frames could differ from one run to the next.
file_name = sorted(os.listdir(KEYFRAME_DIR))[:22]
list_imgs = [os.path.join(KEYFRAME_DIR, name) for name in file_name]

# Accumulator for the OCR results of all batches.
df = pd.DataFrame(columns=["image_path", "text_recognition"])

batch_size = 10
start = time.time()

# Process the images batch by batch; infer_batch returns the accumulated results.
for i in range(0, len(list_imgs), batch_size):
    batch_imgs = list_imgs[i:i + batch_size]
    df = infer_batch(batch_imgs, df)

end = time.time()

# Print the elapsed time in seconds.
print("Time taken: ", end - start)

# After all images are processed, save the results to a CSV file.
# NOTE(review): mode='a' with header=False appends header-less rows, so re-running
# this cell adds the same rows to the file again - confirm this is intended.
df.to_csv('ocr_output.csv', mode='a', header=False, index=False)


In [None]:
# Show the recognized text of the row labelled 1 (the second image processed).
df.loc[1, 'text_recognition']

In [None]:
# Preview the first rows of the OCR results table.
df.head()