In [3]:
import asyncio
import os
import sys
from typing import Any, Dict, List

# Make the project-local modules imported below (multiModelsEmbedding,
# pictureFormatProcess, pictureOcrProcess) importable.
# The original assumption is that multiModelsEmbedding.py lives in the project
# root directory. That directory can be overridden with the
# MULTI_MODEL_PROJECT_ROOT environment variable so the notebook is not tied to
# one developer's machine; the previous hard-coded location is kept as the
# default so existing setups keep working.
sys.path.append(
    os.getenv(
        "MULTI_MODEL_PROJECT_ROOT",
        "/Users/huqianghui/Downloads/git_temp/multi-model-data-process/",
    )
)

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType, VectorizedQuery
from dotenv import load_dotenv
from openai import AzureOpenAI

from multiModelsEmbedding import get_picture_embedding
from pictureFormatProcess import download_and_save_as_pdf
from pictureOcrProcess import analyze_document, get_image_caption_byCV

# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Azure AI Search connection settings (all read from the environment).
azure_search_service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
azure_search_index_name = os.environ.get("AZURE_SEARCH_INDEX")
azure_search_key = os.environ.get("AZURE_COGNITIVE_SEARCH_KEY")
azure_search_credential = AzureKeyCredential(azure_search_key)

# Azure OpenAI client; the API version is pinned to 2024-02-01.
azureOpenAIClient = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_BASE"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01",
)

# Name of the embedding deployment passed as `model` to embeddings.create().
azure_openAI_embedding_deployment = os.environ.get("EMBEDDING_MODEL_DEPLOYMENT")


import logging
import os
from typing import Any, Dict, List

import requests

# Root logger: INFO level, "<timestamp> - <level> - <message>" records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the .env file again before the Computer Vision settings are defined.
# NOTE(review): load_dotenv() was already called earlier in this cell; this
# second call presumably exists only for its verbose diagnostics - confirm.
from dotenv import load_dotenv

load_dotenv(verbose=True)

# Azure Computer Vision settings used by get_text_embedding_by_computer_vision().
#
# SECURITY: the subscription key used to be hard-coded on this line. It is now
# read from the environment (for example from the .env file loaded above).
# Because the old key was committed in source, it should be treated as leaked
# and rotated in the Azure portal.
#
# The endpoint is not a secret, so the previous value is kept as the default.
azure_computer_vision_endpoint = os.getenv(
    "AZURE_COMPUTER_VISION_ENDPOINT",
    "https://cv-hu-test-westus.cognitiveservices.azure.com/",
)
azure_computer_vision_key = os.getenv("AZURE_COMPUTER_VISION_KEY")

if not azure_computer_vision_key:
    # Warn instead of raising so the rest of the notebook can still be used.
    logging.warning(
        "AZURE_COMPUTER_VISION_KEY is not set; Computer Vision requests are expected to fail."
    )

def get_text_embedding_by_computer_vision(text: str, timeout: float = 30.0) -> List[float]:
    """Vectorize ``text`` with the Azure Computer Vision ``vectorizeText`` endpoint.

    Sends ``text`` as JSON to
    ``<azure_computer_vision_endpoint>/computervision/retrieval:vectorizeText``
    (api-version 2024-02-01, model-version 2023-04-15), authenticated with
    ``azure_computer_vision_key``.

    Args:
        text: The text to embed.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang the
            notebook kernel.

    Returns:
        The value of the ``vector`` field of the JSON response.

    Raises:
        Exception: If the service answers with a status code other than 200.
        requests.exceptions.RequestException: On connection errors or when the
            timeout expires.
    """
    logging.info(f"Getting text embedding for {text}")

    # Normalise the base URL so the request works whether or not the
    # configured endpoint ends with a slash.
    url = (
        azure_computer_vision_endpoint.rstrip("/")
        + "/computervision/retrieval:vectorizeText?api-version=2024-02-01&model-version=2023-04-15"
    )
    headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": azure_computer_vision_key
    }
    body = {
        "text": text
    }

    response = requests.post(url, headers=headers, json=body, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error getting text embedding: {response.status_code} - {response.text}")

    data = response.json()
    return data['vector']


In [6]:
import nest_asyncio
nest_asyncio.apply()

pdf_dir = "docs/pdf"


query_image_url="https://img2.tapimg.com/moment/etag/FvhNYMQT78nnCjAvBqHvY40FcH46.jpeg"
query_text = "DNF手游伤害为什么是黄字？"

# # generate ocr content by form recognizer service
# pdfFileLocalPath =  asyncio.run(download_and_save_as_pdf(query_image_url,pdf_dir))
# ocrContent = asyncio.run(analyze_document(pdfFileLocalPath))
# captionByCV = asyncio.run(get_image_caption_byCV(query_image_url))

# query = ocrContent + captionByCV

aoaiResponse = azureOpenAIClient.embeddings.create(input = query_text,model = azure_openAI_embedding_deployment)  
aoai_embedding_query = aoaiResponse.data[0].embedding
#print(aoai_embedding_query)

cv_embedding_query = await get_picture_embedding(query_image_url)
#print(cv_embedding_query)

search_client = SearchClient(azure_search_service_endpoint, azure_search_index_name, AzureKeyCredential(azure_search_key))

aoai_embedding_query = VectorizedQuery(vector=aoai_embedding_query, 
                            k_nearest_neighbors=3, 
                            fields="contentVector,captionVector,ocrContentVecotor")

azure_cv_embedding_query = VectorizedQuery(vector=cv_embedding_query, 
                            k_nearest_neighbors=3, 
                            fields="imageVecotor")

results = search_client.search(  
    search_text=query_text,
    search_fields=["caption","content","ocrContent"],
    query_language="zh-cn",
    scoring_profile="firstProfile",   
    vector_queries=[aoai_embedding_query,azure_cv_embedding_query],
    query_type=QueryType.SEMANTIC, 
    semantic_configuration_name='default', 
    select=["id","caption", "content","imageUrl","ocrContent"],
    top=3
)
print("####################Results####################")

# Print one report per hit; every field goes on its own line.
# NOTE(review): the "Content:" line prints the `caption` field, not `content` -
# confirm which one is intended.
for result in results:
    print(
        f"Reranker Score: {result['@search.reranker_score']}",
        f"Score: {result['@search.score']}",
        f"Captions: {result['@search.captions']}",
        f"Highlights: {result['@search.highlights']}",
        f"Content: {result['caption']}\n",
        f"imageUrl: {result['imageUrl']}\n",
        "###############################",
        sep="\n",
    )

2024-08-27 10:44:26,433 - INFO - HTTP Request: POST https://openai-hu-non-product-test.openai.azure.com//openai/deployments/text-embedding-ada-002/embeddings?api-version=2024-02-01 "HTTP/1.1 200 OK"
2024-08-27 10:44:26,439 - INFO - Getting picture embedding for https://img2.tapimg.com/moment/etag/FvhNYMQT78nnCjAvBqHvY40FcH46.jpeg
2024-08-27 10:44:28,395 - INFO - Request URL: 'https://ai-search-hu-west-us-3.search.windows.net/indexes('multi-model-index-zh-cn')/docs/search.post.search?api-version=REDACTED'
Request method: 'POST'
Request headers:
    'Content-Type': 'application/json'
    'Content-Length': '47039'
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=none'
    'x-ms-client-request-id': '478c53e2-641e-11ef-bf01-aee50117d95a'
    'User-Agent': 'azsdk-python-search-documents/11.6.0b4 Python/3.11.9 (macOS-14.6.1-arm64-arm-64bit)'
A body is sent with the request


####################Results####################


2024-08-27 10:44:30,440 - INFO - Response status: 200
Response headers:
    'Transfer-Encoding': 'chunked'
    'Content-Type': 'application/json; odata.metadata=none; odata.streaming=true; charset=utf-8'
    'Content-Encoding': 'REDACTED'
    'Vary': 'REDACTED'
    'Server': 'Microsoft-IIS/10.0'
    'Strict-Transport-Security': 'REDACTED'
    'Preference-Applied': 'REDACTED'
    'OData-Version': 'REDACTED'
    'request-id': '478c53e2-641e-11ef-bf01-aee50117d95a'
    'elapsed-time': 'REDACTED'
    'Date': 'Tue, 27 Aug 2024 02:44:30 GMT'


Reranker Score: 2.995098114013672
Score: 0.06612903624773026
Captions: None
Highlights: None
Content: DNF手游伤害篇，～（有用记得点赞）增伤词条区别:\n+n伤害 攻击增加伤害 这种词条是黄字。\n为啥是黄字捏？\n黄字就是你的攻击加n伤害然后显示在主伤害里面。（假如100攻击，加百分之5伤害。那么你的主词条就是105伤害）\n攻击时，附加n 攻击时，附加n属性伤害，这种词条是白字！ \n为啥是白字捏？\n白字就是，在你主伤害词条下面多出一个词条。\n附加的属性伤害是附加伤害的1.8倍。（假设你有100伤害，你攻击一下还是100不过下面会出现一个5）。\n暴击伤害与暴击\n暴击就是你攻击的2倍，（假设百分百暴击，100攻击，暴击伤害增加20的话就是220伤害）\n暴击100＝5暴击几率。还有暴击是红字…也是在主伤害条里面。\n附加伤害计算！！！\n6/8/5 这三个增加伤害 （1+0.06+0.08+0.05）＝1.19～\n注意啊，增加伤害最高0.8超过0.8开始稀释。\n所以伤害是什么？伤害是那个1！1是什么？1是攻击力！ 打铁还需自身硬，你装备➕多少了？

imageUrl: https://img2.tapimg.com/moment/etag/FvhNYMQT78nnCjAvBqHvY40FcH46.jpeg

###############################
Reranker Score: 1.2614123821258545
Score: 0.08224925398826599
Captions: None
Highlights: None
Content: 开服玩家必须要拿的九大资源！第二点第二点资源同样是冒险奖励这里大家每天都要来查看领取一下除了我刚刚说的武器还会送防具金币称号等奖励 前期尤其需要注意这个称号天空之城开括者增加32力量和智力 还有其他加成 没有买大礼包的小号一定要用这个称号 一直等满级之后系统才会送一个紫色称号

imageUrl: https://img2.tapimg.com/moment/etag/FsNbH5XW5cqys9PBAF6YtNid_NrB.jpg

#######################