In [14]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Milvus
from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredFileLoader, TextLoader
from loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader
from textsplitter import ChineseTextSplitter
from configs.model_config import *
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os
# SECURITY: never hardcode an API key in a notebook. Export OPENAI_API_KEY in
# the shell that launches Jupyter instead. The key that used to be written on
# this line must be treated as leaked and revoked.
# The key is only needed if the commented-out OpenAIEmbeddings() line further
# down is re-enabled; the HuggingFace embeddings used here do not read it.
if not os.environ.get("OPENAI_API_KEY"):
    print("OPENAI_API_KEY is not set; export it before enabling OpenAIEmbeddings.")

def write_check_file(filepath, docs):
    """Append a plain-text dump of ``docs`` to a debug log beside ``filepath``.

    The log is ``<dirname(filepath)>/tmp_files/load_file.txt``. It is opened in
    append mode, so repeated calls accumulate entries. Each entry is a header
    line ``filepath=<filepath>,len=<len(docs)>`` followed by ``str(item)`` for
    every item of ``docs``, one per line.

    Args:
        filepath: Path of the file the documents were loaded from. Its
            directory decides where the log is written and its value is echoed
            in the header line.
        docs: Sized iterable of loaded documents; only ``len()`` and ``str()``
            of each item are used.
    """
    folder_path = os.path.join(os.path.dirname(filepath), "tmp_files")
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # raise if the directory appeared between the check and the creation.
    os.makedirs(folder_path, exist_ok=True)
    fp = os.path.join(folder_path, 'load_file.txt')
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(fp, 'a+', encoding='utf-8') as fout:
        fout.write("filepath=%s,len=%s" % (filepath, len(docs)))
        fout.write('\n')
        for doc in docs:
            fout.write(str(doc))
            fout.write('\n')
# SENTENCE_SIZE = 10000000
def load_file(filepath, sentence_size=SENTENCE_SIZE):
    """Load ``filepath`` into documents, choosing the loader by file extension.

    The extension match is case-insensitive:

    * ``.md``          -> ``UnstructuredFileLoader`` in "elements" mode, loaded
                          as-is without a text splitter.
    * ``.txt``         -> ``TextLoader`` with encoding auto-detection.
    * ``.pdf``         -> ``UnstructuredPaddlePDFLoader``; the splitter is
                          created with ``pdf=True``.
    * ``.jpg``/``.png`` -> ``UnstructuredPaddleImageLoader`` in "elements" mode.
    * anything else    -> ``UnstructuredFileLoader`` in "elements" mode.

    Every branch except ``.md`` splits the loaded content with a
    ``ChineseTextSplitter`` built from ``sentence_size``. The resulting
    documents are also passed to ``write_check_file`` before being returned.

    Args:
        filepath: Path of the file to load.
        sentence_size: Forwarded to ``ChineseTextSplitter``.

    Returns:
        The documents produced by the selected loader.
    """
    lowered = filepath.lower()

    if lowered.endswith(".md"):
        # Markdown is the only format that bypasses the splitter.
        docs = UnstructuredFileLoader(filepath, mode="elements").load()
    else:
        if lowered.endswith(".txt"):
            loader = TextLoader(filepath, autodetect_encoding=True)
            is_pdf = False
        elif lowered.endswith(".pdf"):
            loader = UnstructuredPaddlePDFLoader(filepath)
            is_pdf = True
        elif lowered.endswith((".jpg", ".png")):
            loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
            is_pdf = False
        else:
            loader = UnstructuredFileLoader(filepath, mode="elements")
            is_pdf = False
        splitter = ChineseTextSplitter(pdf=is_pdf, sentence_size=sentence_size)
        docs = loader.load_and_split(text_splitter=splitter)

    write_check_file(filepath, docs)
    return docs

# Alternative backend (needs OPENAI_API_KEY): embeddings = OpenAIEmbeddings()
# Embedding model and device both come from configs.model_config.
embedding_model: str = EMBEDDING_MODEL
embedding_device = EMBEDDING_DEVICE
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_dict[embedding_model],
    model_kwargs={"device": embedding_device},
)

# Load and split the source document.
# Alternative input: docs = load_file("./English.txt", SENTENCE_SIZE)
docs = load_file("./Chinese_Smart_Water.pdf", SENTENCE_SIZE)

# Embed every chunk and store it in the Milvus instance at localhost:19530.
vector_db = Milvus.from_documents(
    docs,
    embeddings,
    connection_args=dict(host="localhost", port="19530"),
)

INFO  2023-06-09 16:13:14,742-1d: Load pretrained SentenceTransformer: GanymedeNil/text2vec-large-chinese


download https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_infer.tar to C:\Users\silen/.paddleocr/whl\det\ch\ch_PP-OCRv3_det_infer\ch_PP-OCRv3_det_infer.tar


100%|█████████████████████████████████████████████████████████████████████████████| 3.83M/3.83M [00:09<00:00, 407kiB/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_infer.tar to C:\Users\silen/.paddleocr/whl\rec\ch\ch_PP-OCRv3_rec_infer\ch_PP-OCRv3_rec_infer.tar


100%|█████████████████████████████████████████████████████████████████████████████| 11.9M/11.9M [00:16<00:00, 711kiB/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to C:\Users\silen/.paddleocr/whl\cls\ch_ppocr_mobile_v2.0_cls_infer\ch_ppocr_mobile_v2.0_cls_infer.tar


100%|█████████████████████████████████████████████████████████████████████████████| 2.19M/2.19M [00:04<00:00, 498kiB/s]
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 40/40 [11:27<00:00, 17.20s/it]


In [17]:
# Ask the vector store for the chunks most similar to the question.
# NOTE: this rebinds `docs` from the loaded chunks to the search hits; the
# cells below read the hits through that name.
query = "智慧水利可能面临的问题？"
docs = vector_db.similarity_search(query=query)

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.16it/s]


In [23]:
# Show only the text of the hit at index 3 (the recorded run returned four hits).
fourth_hit = docs[3]
print(fourth_hit.page_content)

 通过智慧水利建设，构建新时代的水利公共服务体系。


In [25]:
# Dump the full list of search hits (page_content plus metadata) as its list repr.
print(docs)

[Document(page_content='3存在问题近年来的水利信息化建设虽然取得了较大成绩，智慧水利建设 已进行了积极探索，但水利行业总体上还处于智慧水利建设的起步智慧行业相比，与推进国家水治理体系和治理能力现代化的需求相 比，在以下几个方面都存在较大差距。', metadata={'source': './Chinese_Smart_Water.pdf'}), Document(page_content='智慧水利是智慧社会的重要组成部分，是新时代水利信息化 发展的更高阶段，是水利现代化的前提条件，是推动水治理能力 现代化建设的客观要求。', metadata={'source': './Chinese_Smart_Water.pdf'}), Document(page_content='2 经济效益 139II: 弟一卓形劳与问题 围绕国家信息化战略、新时代水利改革发展要求以及新一代信 息技术等方面分析了水利信息化发展所面临的形势，同时从当前水利信息化基础设施建设、信息资源开发利用、网络与安全、水利业 务应用、水利网信工作管理等多方面分析了智慧水利建设的基础及 未来需要解决的主要问题。', metadata={'source': './Chinese_Smart_Water.pdf'}), Document(page_content=' 通过智慧水利建设，构建新时代的水利公共服务体系。', metadata={'source': './Chinese_Smart_Water.pdf'})]
