In [1]:
import logging
import os
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a loop
# while helpers used later start their own — confirm whether it is still required.
nest_asyncio.apply()


from dotenv import load_dotenv
from openai import AsyncAzureOpenAI

# Timestamped INFO-level logging for the whole notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load variables from a .env file (if present) into the process environment.
load_dotenv(verbose=True)

api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_API_KEY")
deployment_name = 'gpt-4o'
# NOTE(review): the later cells use `beta.chat.completions.parse` with a pydantic
# `response_format`; check that this API version supports structured outputs.
api_version = '2024-02-15-preview' # this might change in the future

# Fail fast: without the endpoint the f-string below would silently build
# the URL "None/openai/deployments/..." and only fail at request time.
if not api_base:
    raise RuntimeError(
        "AZURE_OPENAI_ENDPOINT is not set; define it in the environment or in the .env file."
    )

# The configured endpoint may end with "/", which would otherwise yield
# ".../ /openai/..." style double slashes in every request URL, so strip it here.
# `api_base` itself is left untouched for any other cell that reads it.
aAzureOpenclient = AsyncAzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base.rstrip('/')}/openai/deployments/{deployment_name}",
)

In [None]:
# System prompt variant that also spells out the JSON shape of the reply.
# NOTE: the following cell assigns `systemPrompt` again, so this variant only takes
# effect if that cell is skipped.
# The field names and the "-------" separator below mirror the context text that the
# query cells build (caption / content / ocrContent / imageUrl).
systemPrompt = {
    "role": "system",
    "content": '''
                            You are a helpful assistant and an expert video game player.
                            You know video games very well. You can give professional answers to questions about video games or game screenshots.
                            But you have to answer the question, explain the screenshot, or enrich the content based on the given context and images.
                            The context is a list that contains a few entries like this:

                            context1：
                                "caption": "The caption of the image",
                                "content": "The content of the image",
                                "ocrContent": "The OCR content of the image",
                                "imageUrl": "https://url1"

                            -------
                            context2：
                                "caption": "The caption of the image",
                                "content": "The content of the image",
                                "ocrContent": "The OCR content of the image",
                                "imageUrl": "https://url2"

                            -------
                            context3：
                                "caption": "The caption of the image",
                                "content": "The content of the image",
                                "ocrContent": "The OCR content of the image",
                                "imageUrl": "https://url3"

                            You can use the caption, content and ocrContent to answer the question if they are related to the question.
                            And if you use a context entry to answer the question, you should return its imageUrl as a related image url.
                            The response should be in JSON format and have two parts: the answer and the related image urls,
                            like this:
                            {
                                "answer": "The answer of the question",
                                "images": ["https://url1", "https://url2"]
                            }
                        ''',
}


In [22]:
# System prompt used by the request cells below.
# It does not describe the reply's JSON shape: the requests pass a pydantic model as
# `response_format` instead.
# The field names and the "-------" separator mirror the context text that the query
# cells build (caption / content / ocrContent / imageUrl).
systemPrompt = {
    "role": "system",
    "content": '''
                            You are a helpful assistant and an expert video game player.
                            You know video games very well. You can give professional answers to questions about video games or game screenshots.
                            But you have to answer the question, explain the screenshot, or enrich the content based on the given context and images.
                            The context is a list that contains a few entries like this:

                            context1：
                                "caption": "The caption of the image",
                                "content": "The content of the image",
                                "ocrContent": "The OCR content of the image",
                                "imageUrl": "https://url1"

                            -------
                            context2：
                                "caption": "The caption of the image",
                                "content": "The content of the image",
                                "ocrContent": "The OCR content of the image",
                                "imageUrl": "https://url2"

                            -------
                            context3：
                                "caption": "The caption of the image",
                                "content": "The content of the image",
                                "ocrContent": "The OCR content of the image",
                                "imageUrl": "https://url3"

                            You can use the caption, content and ocrContent to answer the question if they are related to the question.
                            And if you use a context entry to answer the question, you should return its imageUrl as a related image url.
                        ''',
}

from pydantic import BaseModel

# Structured reply requested from the chat model (passed as `response_format`).
# Documented with comments rather than a class docstring so the generated schema is unchanged.
class MultiModelResult(BaseModel):
    # Answer text for the user's question.
    answer: str
    # URLs of the images the answer refers to.
    images: list[str]

In [13]:
import sys

# Make the project root importable for the project-local import that follows.
# (Original note: assumes multiModelsEmbedding.py lives in the project root directory.)
# The location can be overridden with the MULTI_MODEL_PROJECT_ROOT environment variable;
# the default keeps the original local checkout path so existing setups still work.
project_root = os.path.abspath(
    os.getenv(
        "MULTI_MODEL_PROJECT_ROOT",
        "/Users/huqianghui/Downloads/git_temp/multi-model-data-process/",
    )
)
# Re-running this cell must not keep appending the same entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)


from search_utils import get_search_results_by_image_and_text,get_search_results_by_text,get_search_results_by_image

#### Query with text only

In [14]:
# Sample user question (Chinese); roughly: "Why is the damage shown in yellow text in the DNF mobile game?"
queryText = "DNF手游伤害为什么是黄字？"

In [24]:
results = await get_search_results_by_text(queryText)

conext = ""
for i, result in enumerate(results, start=1):
    conext += f"context{i}：\n\n"
    conext += f"caption:\n{result['caption']}\n\n"
    conext += f"content:\n{result['content']}\n\n"
    conext += f"ocrContent:\n{result['ocrContent']}\n\n"
    conext += f"imageUrl:\n{result['imageUrl']}\n\n"
    conext += "-------\n\n"

userPrompt = ''' user question is : {queryText} 

                answer the question based the context: {context}
'''.format(queryText=queryText,context=conext)

response = await aAzureOpenclient.beta.chat.completions.parse(
        model=deployment_name,
        response_format=MultiModelResult,
        messages=[
            systemPrompt,
            { "role": "user", "content": [  
                { 
                    "type": "text", 
                    "text": userPrompt
                }
            ] } 
        ],
        max_tokens=800 
    )

result = response.choices[0].message.content

from IPython.display import display, Image, Markdown
import json

resultDict = json.loads(result)
answer = resultDict['answer']
image_urls = resultDict['images']

# 显示答案文本
display(Markdown(f"**Answer:**\n\n{answer}"))

# 显示图像
for url in image_urls:
    display(Image(url=url))


2024-09-04 10:40:28,065 - INFO - HTTP Request: POST https://openai-hu-non-product-test.openai.azure.com//openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-02-01 "HTTP/1.1 200 OK"
2024-09-04 10:40:28,070 - INFO - Getting text embedding for DNF手游伤害为什么是黄字？
2024-09-04 10:40:29,300 - INFO - Request URL: 'https://ai-search-hu-west-us-3.search.windows.net/indexes('multi-model-index-zh-cn')/docs/search.post.search?api-version=REDACTED'
Request method: 'POST'
Request headers:
    'Content-Type': 'application/json'
    'Content-Length': '48876'
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=none'
    'x-ms-client-request-id': '0c576d30-6a67-11ef-a4e7-aee50117d95a'
    'User-Agent': 'azsdk-python-search-documents/11.6.0b4 Python/3.11.9 (macOS-14.6.1-arm64-arm-64bit)'
A body is sent with the request
2024-09-04 10:40:31,209 - INFO - Response status: 200
Response headers:
    'Transfer-Encoding': 'chunked'
    'Content-Type': 'application/json; odata.met

**Answer:**

在DNF手游中，"黄字伤害"指的是通过装备或技能等附加的攻击力增加。根据上下文，黄字伤害通常表示直接增加在主要伤害中的数字（例如，100攻击增加5%就是105）。这种伤害显示为"黄字"，而不是独立于主伤害的"白字"附加伤害。

如果有任何疑问，欢迎继续提问！

#### Query with text and image

In [25]:
# Same sample question as in the text-only section (Chinese); roughly:
# "Why is the damage shown in yellow text in the DNF mobile game?"
queryText = "DNF手游伤害为什么是黄字？"
# Image URL that accompanies the question; it is passed to the search helper and,
# later, to the chat request as an `image_url` content part.
picture_url = "https://img2.tapimg.com/moment/etag/FvhNYMQT78nnCjAvBqHvY40FcH46.jpeg"

#### 从 AI Search 获取到相关内容，组装成prompt里面的context

In [26]:

results = await get_search_results_by_image_and_text(picture_url,queryText)

conext = ""
for i, result in enumerate(results, start=1):
    conext += f"context{i}：\n\n"
    conext += f"caption:\n{result['caption']}\n\n"
    conext += f"content:\n{result['content']}\n\n"
    conext += f"ocrContent:\n{result['ocrContent']}\n\n"
    conext += f"imageUrl:\n{result['imageUrl']}\n\n"
    conext += "-------\n\n"

2024-09-04 10:41:51,757 - INFO - HTTP Request: POST https://openai-hu-non-product-test.openai.azure.com//openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-02-01 "HTTP/1.1 200 OK"
2024-09-04 10:41:51,760 - INFO - Getting picture embedding for https://img2.tapimg.com/moment/etag/FvhNYMQT78nnCjAvBqHvY40FcH46.jpeg
2024-09-04 10:41:54,295 - INFO - Request URL: 'https://ai-search-hu-west-us-3.search.windows.net/indexes('multi-model-index-zh-cn')/docs/search.post.search?api-version=REDACTED'
Request method: 'POST'
Request headers:
    'Content-Type': 'application/json'
    'Content-Length': '47039'
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=none'
    'x-ms-client-request-id': '3f00a436-6a67-11ef-a4e7-aee50117d95a'
    'User-Agent': 'azsdk-python-search-documents/11.6.0b4 Python/3.11.9 (macOS-14.6.1-arm64-arm-64bit)'
A body is sent with the request
2024-09-04 10:41:56,101 - INFO - Response status: 200
Response headers:
    'Transfer-Encoding': 

In [27]:
# User-turn text: the question, a note that an image is attached, and the retrieved context.
# Written as explicit pieces so the exact whitespace sent to the model is visible.
userPrompt = (
    f" user question is : {queryText} \n"
    "                and with the image.\n"
    "\n"
    f"                answer the question based context is : {conext}\n"
)

In [29]:
response = await aAzureOpenclient.beta.chat.completions.parse(
        model=deployment_name,
        messages=[
            systemPrompt,
            { "role": "user", "content": [  
                { 
                    "type": "text", 
                    "text": userPrompt
                },
                { 
                    "type": "image_url",
                    "image_url": {
                        "url": picture_url
                    }
                }
            ] } 
        ],
        response_format = MultiModelResult,
        max_tokens=800 
    )

result = response.choices[0].message.parsed
print(result)     
    

2024-09-04 10:43:02,758 - INFO - HTTP Request: POST https://openai-hu-non-product-test.openai.azure.com//openai/deployments/gpt-4o/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"


answer='在DNF手游中，伤害以黄字显示的原因是因为这些伤害属于“攻击增加伤害”类别。这意味着，这些增加的伤害直接计算在主要伤害数值中。例如，如果你有100攻击力，增加5%的黄字伤害后，最终展示的主要伤害就是105。这种展示方式与附加属性伤害以白字显示的不同，后者在主要伤害下单独列出。' images=['https://img2.tapimg.com/moment/etag/FvhNYMQT78nnCjAvBqHvY40FcH46.jpeg']


In [31]:


response = await aAzureOpenclient.beta.chat.completions.parse(
        model=deployment_name,
        response_format=MultiModelResult,
        messages=[
            systemPrompt,
            { "role": "user", "content": [  
                { 
                    "type": "text", 
                    "text": userPrompt
                },
                { 
                    "type": "image_url",
                    "image_url": {
                        "url": picture_url
                    }
                }
            ] } 
        ],
        max_tokens=800 
    )

result = response.choices[0].message.parsed
print(result)

2024-09-04 10:45:02,776 - INFO - HTTP Request: POST https://openai-hu-non-product-test.openai.azure.com//openai/deployments/gpt-4o/chat/completions?api-version=2024-02-15-preview "HTTP/1.1 200 OK"


answer='在DNF手游中，伤害数值以黄字显示是因为这种词条代表攻击增加伤害，直接作用在主伤害数字上。例如，如果你有100点攻击力，并增加了5%的伤害，那么主伤害显示就是105。这区别于白字，白字是在主伤害数字下方额外显示的属性伤害。这种差别在游戏设计中用于区分伤害类型，以便玩家调整和优化自己的角色输出策略。' images=['https://img2.tapimg.com/moment/etag/FvhNYMQT78nnCjAvBqHvY40FcH46.jpeg']


In [32]:
from IPython.display import Image, Markdown, display

# Pull both fields out of the parsed reply.
answer, image_urls = result.answer, result.images

# Show the answer text as Markdown.
answer_markdown = Markdown(f"**Answer:**\n\n{answer}")
display(answer_markdown)

# Show every referenced image inline, loaded from its URL.
for url in image_urls:
    image = Image(url=url)
    display(image)

**Answer:**

在DNF手游中，伤害数值以黄字显示是因为这种词条代表攻击增加伤害，直接作用在主伤害数字上。例如，如果你有100点攻击力，并增加了5%的伤害，那么主伤害显示就是105。这区别于白字，白字是在主伤害数字下方额外显示的属性伤害。这种差别在游戏设计中用于区分伤害类型，以便玩家调整和优化自己的角色输出策略。