In [1]:
import os
import re
from elasticsearch import Elasticsearch, helpers
from getpass import getpass
from langchain.embeddings import HuggingFaceEmbeddings

# RABBITMQ_HOST is set before celery_tasks is imported — presumably that module
# reads the broker host from the environment at import time (TODO confirm),
# which is why this assignment sits above the import instead of with the
# other constants.
os.environ["RABBITMQ_HOST"] = "localhost"

from celery_tasks import ingest_data

# --- Batching -----------------------------------------------------------
# NOTE(review): CHUNK_SIZE is not referenced by the visible cells; presumably
# it is the passage chunk size used by the ingest pipeline — confirm.
CHUNK_SIZE = 400
# Number of documents handed to a single ingest task.
ES_CHUNK_SIZE = 50

# --- Index and embedding model -------------------------------------------
INDEX_NAME = "es_french_revo_idx"
# Hugging Face hub id of the embedding model imported into Elasticsearch.
MODEL_ID = "BAAI/bge-large-zh-v1.5"
# Id under which the imported model is addressed inside Elasticsearch.
MODEL_ID_ES = "baai__bge-large-zh-v1.5"
# Dimension and similarity used for the dense_vector field of the index mapping.
MODEL_DIM = 1024
MODEL_SIMILARITY = "cosine"

# --- Elasticsearch connection --------------------------------------------
ES_HOST = "https://localhost:9200/"
# Never keep the password in the notebook: take it from the ES_PASS environment
# variable, or prompt for it when the variable is not set.
# NOTE(review): a plaintext password was previously committed on this line; it
# should be rotated on the cluster.
ES_PASS = os.environ.get("ES_PASS") or getpass("ElasticSearch Password: ")

  from .autonotebook import tqdm as notebook_tqdm
  _transport = transport_class(


In [2]:
# Import the Hugging Face model MODEL_ID into the Elasticsearch cluster at
# ES_HOST as a text_embedding model, authenticating as user "elastic".
# Per the log recorded for this cell, a previously imported copy of the model
# is stopped and deleted before the model is created again (--clear-previous).
# NOTE(review): ES_PASS is passed as a command-line argument, so it may be
# visible to other processes on the machine — confirm this is acceptable.
# NOTE(review): --insecure presumably disables TLS certificate verification and
# --start presumably starts the deployment after import — confirm against the
# eland CLI help.
!eland_import_hub_model \
    -u elastic -p $ES_PASS \
    --url $ES_HOST \
    --hub-model-id $MODEL_ID \
    --task-type text_embedding \
    --insecure \
    --clear-previous \
    --start

2024-05-16 01:50:26,074 INFO : Establishing connection to Elasticsearch
  _transport = transport_class(
2024-05-16 01:50:26,104 INFO : Connected to cluster named 'docker-cluster' (version: 8.13.4)
2024-05-16 01:50:26,105 INFO : Loading HuggingFace transformer tokenizer and model 'BAAI/bge-large-zh-v1.5'
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
STAGE:2024-05-16 01:50:36 6600:6600 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-05-16 01:50:38 6600:6600 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-05-16 01:50:38 6600:6600 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
2024-05-16 01:50:45,752 INFO : Stopping deployment for model with id 'baai__bge-large-zh-v1.5'
2024-05-16 01:50:46,030 INFO : Deleting model with id 'baai__bge-large-zh-v1.5'
2024-05-16 01:50:54,095 INFO : Creating model with id 'baai__bge-large-zh-v1.5'
2024-

In [3]:
# Connect to the Elasticsearch node at ES_HOST as the "elastic" user.
# Certificate verification is switched off; this setting is meant for local
# development only.
client = Elasticsearch(
    hosts=[ES_HOST],
    verify_certs=False,
    basic_auth=("elastic", ES_PASS),
)

# Printing the cluster info doubles as a quick connectivity check.
cluster_info = client.info()
print(cluster_info)

{'name': 'es01', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'i4zHr5YQTMik8-cO0UYiaw', 'version': {'number': '8.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da95df118650b55a500dcc181889ac35c6d8da7c', 'build_date': '2024-05-06T22:04:45.107454559Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


  _transport = transport_class(


In [84]:
# Start from a clean slate: drop the index if it exists (a missing index is
# not an error thanks to ignore_unavailable).
client.indices.delete(ignore_unavailable=True, index=INDEX_NAME)

# Indexed dense-vector field sized and compared according to the model settings.
passage_vector_mapping = {
    "type": "dense_vector",
    "index": True,
    "dims": MODEL_DIM,
    "similarity": MODEL_SIMILARITY,
}

# "passages" is a nested field; the vector lives at
# passages.vector.predicted_value. Other fields are mapped dynamically.
index_mappings = {
    "dynamic": "true",
    "properties": {
        "passages": {
            "type": "nested",
            "properties": {
                "vector": {
                    "properties": {"predicted_value": passage_vector_mapping}
                }
            },
        }
    },
}

# Keep the create call as the last expression so the cell displays its response.
client.indices.create(index=INDEX_NAME, mappings=index_mappings)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'es_french_revo_idx'})

## Add some Documents through Celery

Now we can add documents with large amounts of text in body_content; the text is automatically chunked, and each chunk's text is embedded into a vector by the model:

In [85]:
# Read an MD file
def read_MD(md_file, encoding="utf-8"):
    """Read a Markdown file and split it into a title plus plain-text paragraphs.

    Markdown markup is stripped with regular expressions, the text is split
    into paragraphs/lines, and empty pieces are discarded. The first remaining
    piece is treated as the title.

    Args:
        md_file: Path of the Markdown file to read.
        encoding: Text encoding used to decode the file (default ``"utf-8"``).

    Returns:
        dict: ``{"title": <first non-empty piece>, "docs": [<remaining pieces>]}``.
        For a file with no non-empty piece, ``{"title": "", "docs": []}``.
    """
    # The context manager closes the handle even if reading fails; the explicit
    # encoding keeps decoding independent of the platform default.
    with open(md_file, 'r', encoding=encoding) as f:
        docs = f.read()
    # Remove Markdown markup: heading markers, runs of '*' or '_', "> " quote
    # markers, footnote references such as [^1], and ": " sequences.
    docs = re.sub(r'#+ |\*+|_+|\> |\[\^[0-9]+\]|: ', '', docs)
    # Drop list prefixes ("1. " / "- ") at the start of a line.
    docs = re.sub(r'\n *[0-9]+\. +|\n- ', '\n', docs)
    # Split into paragraphs/lines; a "---" rule between blank lines is a separator too.
    docs = re.split(r'\n\n---\n\n|\n\n|\n', docs)
    # Discard empty strings.
    docs = [doc for doc in docs if doc]
    if not docs:
        # Nothing left after cleaning: return an empty result instead of
        # failing with IndexError on docs[0].
        return {"title": "", "docs": []}
    return {
        "title": docs[0],
        "docs": docs[1:]
    }

In [86]:
root_directory = '../french_revo'
# Every directory below the root, skipping any path that contains ".git".
directories = [x[0] for x in os.walk(root_directory) if '.git' not in x[0]]
docs = []


# os.walk yields the top directory first, so directories[0] is the root itself;
# only its sub-directories are scanned for files.
for directory in directories[1:]:
    md_files = [f"{directory}/{md_file}" for md_file in os.listdir(directory)]
    for md_file in md_files:
        # os.listdir also returns sub-directory names, which read_MD cannot
        # open; those directories are visited on their own by the outer loop.
        if not os.path.isfile(md_file):
            continue
        read_md = read_MD(md_file)
        # One record per paragraph, tagged with its file, title and target index.
        docs += [
            {"text": doc, "title": read_md["title"], "file": md_file, "_index": INDEX_NAME}
            for doc in read_md["docs"]
        ]

print(len(docs))
print(docs[0])
print(docs[0])

6098
{'text': '1793年9月5日将近正午时，巴黎市政厅前的广场上聚起了一大群人，他们准备去国民公会前游行。就在此时，巴黎和整个欧洲北部地区上空都陷入了黑暗之中。由于日食，从巴黎看去，太阳的四分之三区域在11点47分时就开始慢慢丧失光芒。而再往北走，就是日全食了。法国首都的人们就是在这样古怪的昏暗天色里准备他们的暴动的。这个打乱他们日程的变故让众人一片哗然，大家纷纷拿这事说笑。', 'title': '组织恐怖', 'file': '../french_revo/12_who_ruled/12_who_ruled_chapter03.md', '_index': 'es_french_revo_idx'}


In [87]:
# Hand the documents to the ingest_data task in batches of ES_CHUNK_SIZE; each
# asynchronous call receives one batch as its "docs" keyword argument.
for offset in range(0, len(docs), ES_CHUNK_SIZE):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch = docs[offset:offset + ES_CHUNK_SIZE]
    ingest_data.apply_async(kwargs={"docs": batch})

### Aside: Pretty printing Elasticsearch responses

Your API calls will return hard-to-read nested JSON. We'll create a little function called pretty_response to return nice, human-readable outputs from our examples.

In [94]:
def pretty_response(response):
    """Print a human-readable summary of a search response.

    Args:
        response: Mapping that exposes the hit list at
            ``response["hits"]["hits"]``. Each hit must provide ``_id``,
            ``_score`` and a ``_source`` containing ``title`` and ``text``.

    Returns:
        None. For every hit the id, title, text and score are printed,
        followed by a ``---`` separator; a notice is printed when there are
        no hits.
    """
    hits = response["hits"]["hits"]
    if not hits:
        print("Your search returned no results.")
        return

    for hit in hits:
        # Named doc_id rather than id to avoid shadowing the builtin.
        doc_id = hit["_id"]
        score = hit["_score"]
        source = hit["_source"]
        doc_title = source["title"]
        text = source["text"]
        # inner_hits is intentionally not read: the passage text collected from
        # it was never printed, and requiring it made the function fail on
        # responses that carry no inner_hits.
        pretty_output = f"ID: {doc_id}\nDoc Title: {doc_title}\nText:\n{text}\nScore: {score}"
        print(pretty_output)
        print("---")

1. 下载ES数据上传到云
1. 写一个 OpenAI chatbot
1. 部署到云上
1. 写个视频

In [95]:
# kNN search on the nested passage vectors. The query vector is not computed
# in the notebook: query_vector_builder names the model (MODEL_ID_ES) and the
# text to embed.
query_text = "谁在救国委员会主管军火？"
knn_query = {
    # At most one inner hit per document, exposing only passages.text.
    "inner_hits": {"size": 1, "_source": False, "fields": ["passages.text"]},
    "field": "passages.vector.predicted_value",
    "k": 20,
    "num_candidates": 100,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": MODEL_ID_ES,
            "model_text": query_text,
        }
    },
}
response = client.search(index=INDEX_NAME, knn=knn_query)

pretty_response(response)

ID: JgN_f48BtYwLroEYfJIR
Doc Title: 第二章 革命政府（1793—1794）
Text:
在救国委员会中，两名工兵军官卡诺和科多尔的普里厄在陆军部长布硕特（芽月前）的协助下，负责组织和指挥这支军队。普里厄主管军火，有关粮秣车马的事务由兰代襄助；卡诺以主帅身份统一部署军事行动。
Score: 0.8196014
---
ID: qwOAf48BtYwLroEYhppP
Doc Title: 山岳派国民公会：人民运动和救国专政（1793年）
Text:
在无套裤汉陆军部长布肖特的积极配合下，救国委员会调整了对战争的领导，有力地推动了战事。1793年8月14日，职业军官卡诺和科多尔省的普里厄进入救国委员会主持军务。前者负责指导作战，后者负责制造军火。但战役的计划、将军的任命都须经过救国委员会全体会议讨论。罗伯斯庇尔（据他的笔记本中的记载所示）和圣茹斯特在指导战争方面起着重要作用。让邦·圣安德烈在长期任职中领导并发展了铸造工场、步枪制造工场、硝石工场和船舰制造工场。兰代在物资委员会中勤勤恳恳地从事对军队和制造工场的供应工作。卡诺被称为“胜利组织者”是当之无愧的，但这是跟全体救国委员会的努力分不开的。所谓罗伯斯庇尔、圣茹斯特和库东没有参与妥善组织胜利的说法，纯系救国委员会的幸存者制造的热月党传说。这些幸存者企图把大恐怖的责任推到这些被抛弃的人身上，把保卫共和国安全的光荣归于自己。
Score: 0.81228936
---
ID: 3AOAf48BtYwLroEY7J0e
Doc Title: 革命政府的胜利和垮台（1793年12月—1794年7月）
Text:
救国委员会还管辖着临时行政会议的6位部长。1794年4月1日（共和二年芽月12日），按照卡诺向国民公会作的报告，这6位部长为12个执行委员会所取代。这些执行委员会均由国民公会根据救国委员会的举荐任命，它们完全听命于救国委员会，救国委员会则通过“控制政府的意图并就各项重大措施向国民公会提出建议”来保待自己的主导地位。
Score: 0.7818834
---
ID: rAOAf48BtYwLroEYhppP
Doc Title: 山岳派国民公会：人民运动和救国专政（1793年）
Text:
1793年夏天开始进行物资动员。当时什么都缺，仓库和军械库都空了。而军队的兵员7月份前就已增至6

