In [1]:
import base64
import json
import os
import time

from openai import OpenAI

In [2]:
def encode_image(image_path):
	"""Return the contents of the file at ``image_path`` as a Base64 string.

	The file is read in binary mode and the Base64 bytes are decoded as UTF-8,
	so the result is a ``str``.
	"""
	with open(image_path, mode="rb") as fh:
		raw_bytes = fh.read()
	encoded = base64.b64encode(raw_bytes)
	return encoded.decode("utf-8")

In [3]:
## Initialize Client
# Endpoint, model name and API key fall back to the local test values below, but
# each can be overridden through an environment variable so the notebook does not
# have to be edited (or a real key committed) to target another server.
base_url = os.environ.get("LLM_BASE_URL", "http://localhost:8010")
model = os.environ.get("LLM_MODEL", "Qwen/Qwen2-VL-7B-Instruct")

client = OpenAI(
    base_url=f"{base_url}/v1",
    api_key=os.environ.get("LLM_API_KEY", "token-abc123"),
)

def get_response(messages, max_tokens=4096, temperature=0.1):
	"""Send ``messages`` to the chat-completions endpoint and return the reply text.

	Uses the module-level ``client`` and ``model``. As a side effect, prints the
	elapsed time of the request and the first returned choice.

	Args:
		messages: Chat messages passed through unchanged to
			``client.chat.completions.create``.
		max_tokens: Value forwarded as ``max_tokens`` (default 4096).
		temperature: Value forwarded as ``temperature`` (default 0.1).

	Returns:
		The ``content`` of the first choice's message.
	"""
	# perf_counter() is monotonic, so the measured interval cannot be skewed by
	# system clock adjustments the way a time.time() difference can.
	start = time.perf_counter()
	response = client.chat.completions.create(
		model=model,
		messages=messages,
		max_tokens=max_tokens,
		temperature=temperature,
	)
	elapsed = time.perf_counter() - start
	print("Time: {:.3f}".format(elapsed))

	first_choice = response.choices[0]
	print(first_choice)
	return first_choice.message.content

In [4]:
# Select the article and page index, then Base64-encode the matching PNG
# from the local cache directory.
article_id = "2305.00379"
page_num = 0
page1_path = f"cache/pdf2img/{article_id}/{page_num}.png"
page1_img = encode_image(page1_path)

In [5]:
## Test messages
# Prompt asking the model to summarize the abstract and reply in a fixed JSON shape.
instruction = '''Read the abstract of the paper and summarize it into bullet points
Return like the following JSON
{"point": ["...", ...]}'''

# Single user turn carrying the text instruction plus the page image as an
# inline data URL. The encoded file is a .png, so the declared media type is
# image/png to match the actual payload.
messages = [
	{
		"role": "user",
		"content": [
			{"type": "text","text": instruction},
			{"type": "image_url","image_url": {"url": f"data:image/png;base64,{page1_img}"}}
		]
	}
]

In [6]:
# Run the request; `response` holds the generated message text (a string),
# since get_response returns the first choice's message content.
response = get_response(messages)

Time: 4.053
Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n  "point": [\n    "The paper proposes a Dual-Path Cooperative Filtering (DCF) model for image completion.",\n    "The DCF model uses Fast Fourier Convolution to extract multi-level features and predict dynamic kernels.",\n    "The model aims to fill in missing information while preserving local structure and generating visually realistic content.",\n    "Experiments on three challenging image completion datasets show that the proposed DCF outperforms state-of-the-art methods.",\n    "The paper addresses the limitations of existing image completion methods, such as poor cross-scene generalization and blurry artifacts.",\n    "The DCF model is designed to have a strong capacity to generalize across regions that are missing.",\n    "The paper compares the DCF model to baseline methods like RFRNet, JPGNet, and LaMa, showing that it generates high-fidelity and more realistic images."\n

In [7]:
# Parse the model's reply as JSON, then pretty-print it with tab indentation,
# leaving non-ASCII characters unescaped.
response_dict = json.loads(response)
pretty_response = json.dumps(response_dict, ensure_ascii=False, indent="\t")
print(pretty_response)

{
	"point": [
		"The paper proposes a Dual-Path Cooperative Filtering (DCF) model for image completion.",
		"The DCF model uses Fast Fourier Convolution to extract multi-level features and predict dynamic kernels.",
		"The model aims to fill in missing information while preserving local structure and generating visually realistic content.",
		"Experiments on three challenging image completion datasets show that the proposed DCF outperforms state-of-the-art methods.",
		"The paper addresses the limitations of existing image completion methods, such as poor cross-scene generalization and blurry artifacts.",
		"The DCF model is designed to have a strong capacity to generalize across regions that are missing.",
		"The paper compares the DCF model to baseline methods like RFRNet, JPGNet, and LaMa, showing that it generates high-fidelity and more realistic images."
	]
}
