In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [171]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader

from openai import OpenAI
client = OpenAI(api_key=os.getenv('OPEN_AI_KEY'))

# from langchain_elasticsearch import ElasticsearchStore
# from langchain_community.retrievers import TFIDFRetriever

In [46]:
# get the related file in file path
directory_path = './doc'
files = os.listdir(directory_path)

## 讀取檔案夾檔案

In [47]:
# one key-value pair is one file with its content
allFiles = {}

In [48]:
for file in files:
    file_name, file_extension = os.path.splitext(file)
    # get the document
    file_path = os.path.join(directory_path, file)
    
    # check the end file name
    if file.endswith('.txt') or file.endswith('.md'):
        # use txt to load the file
        loader = TextLoader(file_path)
        data = loader.load()
    elif file.endswith('.pdf'):
        # use PyPDFDirectoryLoader to load pdf file
        loader = PyPDFLoader(file_path)
        data = loader.load()
    else:
        # otherwise not support
        continue
    allFiles[file_name] = data

[Document(page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known\nreal-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable\nDETR, DINO-5scale-R50, ViT-Adapter-B and

In [49]:
# for i in allFiles:
#     print(allFiles[i])

[Document(page_content='YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object\ndetectors\nChien-Yao Wang1, Alexey Bochkovskiy, and Hong-Yuan Mark Liao1\n1Institute of Information Science, Academia Sinica, Taiwan\nkinyiu@iis.sinica.edu.tw, alexeyab84@gmail.com, and liao@iis.sinica.edu.tw\nAbstract\nYOLOv7 surpasses all known object detectors in both\nspeed and accuracy in the range from 5 FPS to 160 FPS\nand has the highest accuracy 56.8% AP among all known\nreal-time object detectors with 30 FPS or higher on GPU\nV100. YOLOv7-E6 object detector (56 FPS V100, 55.9%\nAP) outperforms both transformer-based detector SWIN-\nL Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by\n509% in speed and 2% in accuracy, and convolutional-\nbased detector ConvNeXt-XL Cascade-Mask R-CNN (8.6\nFPS A100, 55.2% AP) by 551% in speed and 0.7% AP\nin accuracy, as well as YOLOv7 outperforms: YOLOR,\nYOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable\nDETR, DINO-5scale-R50, ViT-Adapter-B and

In [36]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [37]:
# download first
hug_embed = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
query_result = hug_embed.embed_query('hello world')
len(query_result)

384

In [51]:
# split the documents into piece
arr = []
for i in allFiles:
    data = allFiles[i]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,  chunk_overlap=20)
    arr.append(text_splitter.split_documents(data))
    break

## Elasticsearch

使用es上的ca，還有user, password直接註冊

In [135]:
from elasticsearch import Elasticsearch

In [136]:
es_username = os.getenv('USERNAME')
es_password = os.getenv('PASSWORD')

In [137]:
hosts = ["https://172.16.1.205:9200", "https://172.16.1.206:9200", "https://172.16.1.207:9200"]

In [138]:
es = Elasticsearch(
    hosts=hosts,
    basic_auth=(es_username, es_password),
    verify_certs=True,
    ca_certs='./certs/http_ca.crt'
)

In [139]:
try:
    # get cluster info
    cluster_info = es.cluster.health()
    print("成功連接到 Elasticsearch, 集群訊息：", cluster_info)
except Exception as e:
    print("連接失敗:", e)

成功连接到 Elasticsearch，集群信息： {'cluster_name': 'my-application', 'status': 'green', 'timed_out': False, 'number_of_nodes': 3, 'number_of_data_nodes': 3, 'active_primary_shards': 84, 'active_shards': 168, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 0, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 100.0}


## 寫入mapping

In [140]:
# dims need to check the embedding, similarity has many options, i choose cosine
mapping = {
    "mappings": {
        "properties": {
            "embedding": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "text": {
                "type": "text"
            }
        }
    }
}

In [141]:
index_name = "vector_search"
es.indices.create(index=index_name, body=mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector_search'})

## 建立ElasticsearchStore

In [158]:
# store = ElasticsearchStore(
#     embedding=hug_embed,
#     index_name="vector_search",
#     es_connection=es
# )

# next for loop need to set this, not use for to iterate each one to query
# db = store.from_documents(
#     i,
#     hug_embed,
#     index_name="vector_search",
# )
# db.client.indices.refresh(index="vector_search")

## 將向量資料寫入es

In [163]:
for i in arr:
    # file_name_with_extension = os.path.basename(i[0].metadata['source'])
    # # get file_name and file_extension
    # file_name, file_extension = os.path.splitext(file_name_with_extension)


    # # after that check page_content
    for j in i:
        # j.page_content will be the string
        text = j.page_content
        # vector will be embedding vector
        vector = hug_embed.embed_query(j.page_content)       
        write = {
            "embedding": vector,  # your vector
            "text": text,         # your text
        }
        # write the data into es
        es.index(index=index_name, body=write)
    

## 向量查詢

In [164]:
query = 'YPLOv7 outperforms which models'
# query = "Who is Ketanji Brown Jackson?"

In [165]:
query_result = hug_embed.embed_query(query)

In [166]:
es_query = {
  "knn": {
    "field": "embedding",
    "query_vector": query_result,
    "k": 3
  }
}

In [167]:
index_name = 'vector_search'
response = es.search(index=index_name, body=es_query)

In [168]:
response

ObjectApiResponse({'took': 42, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 3, 'relation': 'eq'}, 'max_score': 0.7695713, 'hits': [{'_index': 'vector_search', '_id': 'W1zAdY8Bob4RrNZHx8_2', '_score': 0.7695713, '_source': {'embedding': [-0.0006992057315073907, 0.006794158834964037, 0.061217255890369415, 0.043516770005226135, 0.06407414376735687, -0.06213737651705742, -0.05320874974131584, 0.04504915326833725, 0.051922623068094254, -0.012760432437062263, -0.060371194034814835, 0.0008794720051810145, -0.043961793184280396, -0.001731606782414019, 0.01457609236240387, 0.025795934721827507, 0.15916214883327484, -0.0061791446059942245, -0.11912669241428375, -0.051486119627952576, -0.05359676852822304, -0.0512518472969532, 0.00846325233578682, -0.012165357358753681, 0.04872315004467964, -0.04351532831788063, -0.03741211071610451, 0.019255876541137695, 0.026298552751541138, -0.050701335072517395, -0.03365696966648102, 0.01

In [169]:
vector_search_text = ''
for hit in response['hits']['hits']:
    vector_search_text += hit['_source']['text']
    vector_search_text += ', '

In [172]:
PromptTemplate=f'''
Here is the content which related to question
content: {vector_search_text}

Please answer the question
question: {query}
'''

In [173]:
PromptTemplate

'\nHere is the content which related to question\ncontent: From the results we see that if compared with YOLOv4,\nYOLOv7 has 75% less parameters, 36% less computation,\nand brings 1.5% higher AP. If compared with state-of-the-\nart YOLOR-CSP, YOLOv7 has 43% fewer parameters, 15%\nless computation, and 0.4% higher AP. In the performance\nof tiny model, compared with YOLOv4-tiny-31, YOLOv7-\ntiny reduces the number of parameters by 39% and the\namount of computation by 49%, but maintains the same AP.\nOn the cloud GPU model, our model can still have a higher, If we compare YOLOv7 with YOLOR using the input\nresolution 1280, the inference speed of YOLOv7-W6 is 8\nfps faster than that of YOLOR-P6, and the detection rate is\nalso increased by 1% AP. As for the comparison between\nYOLOv7-E6 and YOLOv5-X6 (r6.1), the former has 0.9%\nAP gain than the latter, 45% less parameters and 63% less\ncomputation, and the inference speed is increased by 47%.\nYOLOv7-D6 has close inference speed to YOLO

## 只用chat.completion來回覆問題

In [174]:
openai_response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
      {
        "role": "user",
        "content": PromptTemplate
      }
    ]
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [180]:
openai_response

ChatCompletion(id='chatcmpl-9Ofv8u19KrQ3GPiEe3XrvKf4KtRiP', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='YOLOv7 outperforms YOLOv4, YOLOR-CSP, YOLOv4-tiny-31, YOLOR-P6, YOLOv5-X6 (r6.1) in terms of parameters, computation, and average precision (AP).', role='assistant', function_call=None, tool_calls=None))], created=1715668042, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=59, prompt_tokens=409, total_tokens=468))

In [179]:
openai_response.choices[0].message.content

'YOLOv7 outperforms YOLOv4, YOLOR-CSP, YOLOv4-tiny-31, YOLOR-P6, YOLOv5-X6 (r6.1) in terms of parameters, computation, and average precision (AP).'

In [185]:
completion_tokens = openai_response.usage.completion_tokens
prompt_tokens = openai_response.usage.prompt_tokens
total_tokens = openai_response.usage.total_tokens

print(f"Completion Tokens: {completion_tokens}")
print(f"Prompt Tokens: {prompt_tokens}")
print(f"Total Tokens: {total_tokens}")

Completion Tokens: 59
Prompt Tokens: 409
Total Tokens: 468


## Openai 使用retrieveQA回覆問題

In [104]:
# llm = ChatOpenAI(
#     openai_api_key=os.environ['OPENAI_API_KEY'],
#     model_name='gpt-3.5-turbo',  
#     temperature=0.0 
# )

In [131]:
# 创建 TF-IDF 检索器
# tfidf_retriever = TFIDFRetriever

In [132]:
# qa = RetrievalQA.from_chain_type(
#     llm=llm, 
#     chain_type="stuff",
#     retriever=vector_search_text.tfidf_retriever()
# )

AttributeError: 'str' object has no attribute 'tfidf_retriever'

In [None]:
# qa.invoke(query)

測試使用api_key連結

In [22]:
# from elasticsearch import Elasticsearch

In [None]:
# k = os.getenv('api_key')

In [None]:
# client = Elasticsearch(
#   "https://172.16.1.205/",
#   api_key=os.getenv('api_key'),
#   verify_certs=True,
#   ca_certs='./certs/http_ca.crt'
# )

In [None]:
# API key should have cluster monitor rights
# client.info()

# try:
#     # 尝试获取集群信息
#     cluster_info = client.cluster.health()
#     print("成功连接到 Elasticsearch，集群信息：", cluster_info)
# except Exception as e:
#     print("连接失败:", e)

连接失败: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<elastic_transport._node._urllib3_chain_certs.HTTPSConnection object at 0x13bc36c50>: Failed to establish a new connection: [Errno 61] Connection refused))
