### 11B : Llama 3.2 11B Vision

### 모델, 토크나이저, 질의 데이터 초기화

In [None]:
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, DataCollatorWithPadding, AutoTokenizer

# Checkpoint id shared by the tokenizer, the model and the processor below.
# NOTE(review): this id has no "-Instruct" suffix, while the prompts later in
# this notebook use chat header tokens — confirm which checkpoint was intended.
model_id = "meta-llama/Llama-3.2-11B-Vision"

# Text tokenizer, plus a padding collator built on top of it.
tokenizer = AutoTokenizer.from_pretrained(model_id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the weights as float16 and let device_map="auto" choose the placement.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.float16
)

# Processor that turns an (image, prompt) pair into model inputs.
processor = AutoProcessor.from_pretrained(model_id)

### 질의 데이터 가져오기

In [None]:
from PIL import Image

# File names of the question images, three per subject. Despite the `_url`
# suffix, the values are plain file names passed straight to Image.open().
english_1_url, english_2_url, english_3_url = (
    "english_1_a_1.png",
    "english_2_a_4.png",
    "english_3_a_2.png",
)
math_1_url, math_2_url, math_3_url = (
    "math_1_a_1.png",
    "math_2_a_5.png",
    "math_3_a_5.png",
)
korean_1_url, korean_2_url, korean_3_url = (
    "korean_1_a_1.png",
    "korean_2_a_1.png",
    "korean_3_a_2.png",
)

# Open each image with PIL, in the same order the names were declared.
english_1_url_image, english_2_url_image, english_3_url_image = (
    Image.open(english_1_url),
    Image.open(english_2_url),
    Image.open(english_3_url),
)
math_1_url_image, math_2_url_image, math_3_url_image = (
    Image.open(math_1_url),
    Image.open(math_2_url),
    Image.open(math_3_url),
)
korean_1_url_image, korean_2_url_image, korean_3_url_image = (
    Image.open(korean_1_url),
    Image.open(korean_2_url),
    Image.open(korean_3_url),
)

### 영어 질의하기

In [None]:
# Extract the text from the first English image and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Extract words.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(english_1_url_image, prompt, return_tensors="pt").to(model.device)

# Sample at temperature 0.7, generating at most 300 new tokens.
output = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

In [None]:
# Ask for an explanation of the first English question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(english_1_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 300 new tokens.
output = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

In [None]:
# Ask for an explanation of the second English question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(english_2_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 300 new tokens.
output = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

In [None]:
# Ask for an explanation of the third English question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(english_3_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 300 new tokens.
output = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

### 수학 질의하기

In [None]:
# Ask for an explanation of the first math question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(math_1_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 100 new tokens.
output = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

In [None]:
# Ask for an explanation of the second math question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(math_2_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 100 new tokens.
output = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

In [None]:
# Ask for an explanation of the third math question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(math_3_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 100 new tokens.
output = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

### 국어 질의하기

In [None]:
# Ask for an explanation of the first Korean question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(korean_1_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 200 new tokens.
output = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

In [None]:
# Ask for an explanation of the second Korean question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(korean_2_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 200 new tokens.
output = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

In [None]:
# Ask for an explanation of the third Korean question and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Provide an explanation of the correct answer to the question. Please speak korean.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(korean_3_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7, generating at most 200 new tokens.
output = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")

In [None]:
# Extract the text from the third Korean image and measure the run time.
import time

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments as time.time() can.
start = time.perf_counter()

# Prompt with user/assistant header tokens and an <|image|> tag.
# NOTE(review): the trailing "sentences" line looks like leftover text —
# confirm it is intentional.
prompt = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Extract words.
sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# Build the model inputs from image + prompt and move them to the model's device.
inputs = processor(korean_3_url_image, prompt, return_tensors="pt").to(model.device)
# Sample at temperature 0.7; the token budget here is much larger (2000)
# than in the explanation cells.
output = model.generate(**inputs, max_new_tokens=2000, do_sample=True, temperature=0.7)
print(processor.decode(output[0]))

end = time.perf_counter()

# The elapsed time covers preprocessing, generation, decoding and printing.
print(f"{end - start:.5f} sec")