In [1]:
"""

将医疗数据集导入到ES，导入方法不是按照langchain的方式进行导入

"""
import os, json
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch



In [2]:


client = Elasticsearch("http://localhost:9200")

print(client.info())



{'name': 'autodl-container-5525458f7b-77aecf75', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'lce4zL7nTHO9vV4feMmyWQ', 'version': {'number': '8.15.1', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '253e8544a65ad44581194068936f2a5d57c2c051', 'build_date': '2024-09-02T22:04:47.310170297Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [5]:

all_example_list = []
all_query_list = []
with open("FreedomIntelligence___huatuo26_m-lite/format_data.jsonl", "r", encoding="utf-8") as fin:
    for line in fin:
        # print(line.strip())
        example_qa = json.loads(line)
        all_query_list.append(example_qa["question"])
        all_example_list.append(example_qa)
        


from modelscope import snapshot_download
embedding_model_dir = snapshot_download("AI-ModelScope/bge-base-zh-v1.5", revision='master', cache_dir="./bge-base-zh-v1.5")


# embedding_model_dir  = "./bge-base-zh-v1.5"
from FlagEmbedding import FlagModel

embedding_model = FlagModel(embedding_model_dir, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章：",
                  use_fp16=True)


print("转换embedding")
query_embeddings = embedding_model.encode_corpus(all_query_list, batch_size=512)

for i in range(len(all_query_list)):
    all_example_list[i]["question_vector"] = query_embeddings[0]




转换embedding


Inference Embeddings: 100%|██████████| 348/348 [01:52<00:00,  3.10it/s]


In [7]:

len(all_example_list[0]["question_vector"])


768

In [11]:

from elasticsearch import helpers



mapping = {
    "properties": {
      "id": {
        "type": "long"
      },
      "answer": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "score": {
        "type": "integer"
      },
      "label": {
        "type": "keyword"
      },
      "question": {
        "type": "text",
        "analyzer": "ik_max_word",
        "search_analyzer": "ik_smart"
      },
      "related_diseases": {
        "type": "keyword"
      },
      "question_vector": {
        "type": "dense_vector",
        "dims": 768,
        "similarity": "cosine"
      }
    }
  }

# client.indices.create(index="my_index", mappings=mappings)
client.indices
create_response = client.indices.create(index="rag_es_index", mappings=mapping)
print(create_response)


{'acknowledged': True, 'shards_acknowledged': True, 'index': 'rag_es_index'}


In [13]:
from tqdm import tqdm
all_new_example_list = []
for example_qa in tqdm(all_example_list):
    example_qa["question_vector"] = list(example_qa["question_vector"])
    all_new_example_list.append(example_qa)

print(type(all_new_example_list[0]["question_vector"]))


100%|██████████| 177703/177703 [00:08<00:00, 20169.90it/s]

<class 'list'>





In [15]:

from elasticsearch import helpers

# 为每个样本添加索引
all_new_example_list_added_index = []
for example_qa in all_new_example_list:
    example_qa["_index"] = "rag_es_index"
    all_new_example_list_added_index.append(example_qa)
    
# 预计耗费8分钟
helpers.bulk(client=client, actions=all_new_example_list_added_index)



(177703, [])

In [20]:
[ex["label"] for ex in all_example_list[:10]]

['眼耳鼻喉科',
 '皮肤性病科',
 '眼耳鼻喉科',
 '皮肤性病科',
 '皮肤性病科',
 '感染与免疫科',
 '口腔科',
 '皮肤性病科',
 '肿瘤科',
 '儿科']

In [46]:
# 定义embedding模型

# embedding_model 
# 上述模型已经定好了


query = "成年人感冒流鼻涕应该吃什么药"
label = "儿科"

query_embedding = embedding_model.encode_queries(queries=query)

print(len(list(query_embedding)))





768


In [49]:


query = "成年人感冒流鼻涕应该吃什么药"
label = "儿科"

query_embedding = embedding_model.encode_queries(queries=query).tolist()

# 貌似这个效果好
# response = client.search(index="rag_es_index", query={"bool": {"should": [{"term": {"label": label}}, 
#                                                                           {"match": {"question": {"query": query}}}]}})



# response = client.search(index="rag_es_index", query={"bool": {"should": [{"term": {"label": {"value": query, "boost": 0.1}}}, 
#                                                                           {"match": {"question": {"query": query, "boost": 0.9}}}]}})

response = client.search(index="rag_es_index", query={"bool": {"should": [{"term": {"label": {"value": query, "boost": 0.1}}}, 
                                                                          {"match": {"question": {"query": query, "boost": 0.9}}}]}},
                        knn={"field": "question_vector", "query_vector": query_embedding, "k": 5, "num_candidates": 10, "boost": 1.5},
                        source=["question", "answer", "label", "related_diseases", "score"])



In [50]:

response["hits"]["hits"]


[{'_index': 'rag_es_index',
  '_id': 'tltnlJIBHqNJn9d5SQau',
  '_score': 18.897446,
  '_source': {'answer': '根据你描述的症状，可能是感冒引起的。建议注意保暖，多喝水。',
   'score': 4,
   'label': '眼耳鼻喉科',
   'question': '我最近10多天经常这样，打喷嚏，流鼻涕，不知道吃什么药。请问医生这是感冒还是鼻炎。应该吃什么药。谢谢！',
   'related_diseases': '鼻炎'}},
 {'_index': 'rag_es_index',
  '_id': 'pVtolJIBHqNJn9d5m35l',
  '_score': 18.605427,
  '_source': {'answer': '哺乳期感冒了，应该注意饮食，以容易消化的软食为主，如果症状严重，需要同时口服抗生素。如果不能吃药的情况下，可以选择使用一些退烧贴进行一些身体的降温。药物治疗的话，还是建议到一个医院进行一个详细的检查，在专业的指导下进行一个药物使用。',
   'score': 4,
   'label': '妇产科',
   'question': '我这几天感冒了，一直流鼻涕还有点咳嗽，好怕传染给宝宝，哺乳期，母亲感冒发烧，影响孩子吃奶吗？应该吃什么药？这个应该怎么办，哺乳期宝妈感冒了吃什么药？',
   'related_diseases': '感冒'}},
 {'_index': 'rag_es_index',
  '_id': '0VtolJIBHqNJn9d5TWSR',
  '_score': 18.5161,
  '_source': {'answer': '针对感冒引起的流鼻涕，可以采取以下措施缓解症状：保持室内空气湿润，多喝水，多休息，避免吸烟和二手烟，可以使用盐水漱口或者盐水鼻喷来缓解鼻塞和流鼻涕。\n如果感冒后经常流的鼻涕为清水状，可以服用相关药物等治疗鼻黏膜充血、水肿。如果鼻涕为黄脓鼻涕，同时伴有嗓子痛的时候，感冒可能是由于细菌感染所引起的。药物的选择，一定要按照说明书进行用药。',
   'score': 5,
   'label': '内科',
   'question': '我感冒已经几天了，