### Qwen2.5-VL-7B-Instruct

### 모델, 토크나이저, 질의 데이터 초기화

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, DataCollatorWithPadding, AutoProcessor
from qwen_vl_utils import process_vision_info

# Hugging Face checkpoint shared by the processor, tokenizer, and model below.
model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

# Default processor for the checkpoint; the inference cells use it to render
# the chat prompt, build the model inputs, and decode the generated ids.
processor = AutoProcessor.from_pretrained(model_id)

# Stand-alone tokenizer plus a padding collator built on top of it.
# NOTE(review): neither name is referenced by the cells visible in this
# notebook; kept in case other code depends on them.
tokenizer = AutoTokenizer.from_pretrained(model_id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the weights as float16 and pass device_map="auto" so that device
# placement is chosen by the library rather than fixed here.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

### 질의 데이터 가져오기

In [None]:
from PIL import Image

# File paths of the query images, grouped by subject. Despite the *_url
# suffix these are relative local file paths, not remote URLs.
english_1_url, english_2_url, english_3_url = (
    "english_1_a_1.png",
    "english_2_a_4.png",
    "english_3_a_2.png",
)

math_1_url, math_2_url, math_3_url = (
    "math_1_a_1.png",
    "math_2_a_5.png",
    "math_3_a_5.png",
)

korean_1_url, korean_2_url, korean_3_url = (
    "korean_1_a_1.png",
    "korean_2_a_1.png",
    "korean_3_a_2.png",
)

# Open every path as a PIL image; each *_url_image matches the *_url above.
english_1_url_image, english_2_url_image, english_3_url_image = (
    Image.open(path) for path in (english_1_url, english_2_url, english_3_url)
)

math_1_url_image, math_2_url_image, math_3_url_image = (
    Image.open(path) for path in (math_1_url, math_2_url, math_3_url)
)

korean_1_url_image, korean_2_url_image, korean_3_url_image = (
    Image.open(path) for path in (korean_1_url, korean_2_url, korean_3_url)
)

### 영어 질의 하기

In [None]:
# Ask the model to extract every word from the english_1 image and report
# how long the whole request (preprocessing + generation + decoding) takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": english_1_url_image,
            },
            {"type": "text", "text": "Extract every word"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=1000)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# english_1 problem image and report how long the whole request takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": english_1_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=300)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# english_2 problem image and report how long the whole request takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": english_2_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=300)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# english_3 problem image and report how long the whole request takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": english_3_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=300)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

### 수학 질의 하기

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# math_1 problem image and report how long the whole request takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": math_1_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=300)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# math_2 problem image and report how long the whole request takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": math_2_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=300)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# math_3 problem image and report how long the whole request takes.
# This cell allows up to 1000 new tokens, more than the other explanation cells.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": math_3_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=1000)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

### 국어 질의 하기

In [None]:
# Ask the model to extract every word from the korean_3 image and report
# how long the whole request (preprocessing + generation + decoding) takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": korean_3_url_image,
            },
            {"type": "text", "text": "Extract every word"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=2000)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# korean_1 problem image and report how long the whole request takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": korean_1_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=300)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# korean_2 problem image and report how long the whole request takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": korean_2_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=300)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")

In [None]:
# Ask the model for a Korean explanation of the correct answer to the
# korean_3 problem image and report how long the whole request takes.
import time

# perf_counter() is monotonic and high-resolution, so the measured duration
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": korean_3_url_image,
            },
            {"type": "text", "text": "Provide an explanation of the correct answer to the question. Please speak korean"},
        ],
    }
]

# Preparation for inference: render the chat prompt as text and collect the
# image/video inputs referenced by the messages.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model is loaded with device_map="auto", so follow the model's device
# instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=300)
# Strip the leading prompt tokens so only the newly generated part is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

end = time.perf_counter()

print(f"{end - start:.5f} sec")