In [1]:
import os
import re
from elasticsearch import Elasticsearch, helpers
from getpass import getpass
from langchain.embeddings import HuggingFaceEmbeddings

os.environ["RABBITMQ_HOST"] = "localhost"

from celery_tasks import ingest_data

CHUNK_SIZE = 400
ES_CHUNK_SIZE = 50
INDEX_NAME = "es_french_revo_idx"
MODEL_ID = "BAAI/bge-large-zh-v1.5"
MODEL_ID_ES = "baai__bge-large-zh-v1.5"
MODEL_DIM = 1024
MODEL_SIMILARITY = "cosine"

ES_HOST = "https://localhost:9200/"
ES_PASS = "y5AADXZR0l63CvTz1AsWznNiAM1Ukq7KSd3MEra"
# ES_PASS = getpass("ElasticSearch Password: ")
# COHERE_API_KEY = getpass("Elastic Api Key: ")

  _transport = transport_class(


In [2]:
!eland_import_hub_model \
    -u elastic -p $ES_PASS \
    --url $ES_HOST \
    --hub-model-id $MODEL_ID \
    --task-type text_embedding \
    --insecure \
    --clear-previous \
    --start

2024-05-16 01:50:26,074 INFO : Establishing connection to Elasticsearch
  _transport = transport_class(
2024-05-16 01:50:26,104 INFO : Connected to cluster named 'docker-cluster' (version: 8.13.4)
2024-05-16 01:50:26,105 INFO : Loading HuggingFace transformer tokenizer and model 'BAAI/bge-large-zh-v1.5'
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
STAGE:2024-05-16 01:50:36 6600:6600 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-05-16 01:50:38 6600:6600 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-05-16 01:50:38 6600:6600 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
2024-05-16 01:50:45,752 INFO : Stopping deployment for model with id 'baai__bge-large-zh-v1.5'
2024-05-16 01:50:46,030 INFO : Deleting model with id 'baai__bge-large-zh-v1.5'
2024-05-16 01:50:54,095 INFO : Creating model with id 'baai__bge-large-zh-v1.5'
2024-

In [4]:
# Create the client instance
client = Elasticsearch(
    # For local development
    hosts=[ES_HOST],
    basic_auth=('elastic', ES_PASS), 
    verify_certs=False
)
print(client.info())

{'name': 'es01', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'i4zHr5YQTMik8-cO0UYiaw', 'version': {'number': '8.13.4', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'da95df118650b55a500dcc181889ac35c6d8da7c', 'build_date': '2024-05-06T22:04:45.107454559Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}




In [84]:
client.indices.delete(index=INDEX_NAME, ignore_unavailable=True)

# Setup the index
client.indices.create(
    index=INDEX_NAME,
    mappings={
        "dynamic": "true",
        "properties": {
            "passages": {
                "type": "nested",
                "properties": {
                    "vector": {
                        "properties": {
                            "predicted_value": {
                                "type": "dense_vector",
                                "index": True,
                                "dims": MODEL_DIM,
                                "similarity": MODEL_SIMILARITY,
                            }
                        }
                    }
                },
            }
        },
    },
)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'es_french_revo_idx'})

## Add some Documents through Celery

Now we can add documents with large amounts of text in body_content and automatically have them chunked, and each chunk text embedded into vectors by the model:

In [85]:
#Read MD File
def read_MD(md_file):
    f = open(md_file, 'r')
    docs = f.read()
    # 删除 markdown 标记
    docs = re.sub('#+ |\*+|_+|\> |\[\^[0-9]+\]|: ', '', docs)
    docs = re.sub('\n *[0-9]+\. +|\n- ', '\n', docs)
    # 按自然段分行
    docs = re.split('\n\n---\n\n|\n\n|\n', docs)
    # 删除空字符串
    docs = list(filter(lambda doc: len(doc) > 0, docs))
    title = docs[0]
    return {
        "title": title,
        "docs": docs[1:]
    }

In [86]:
root_directory = '../french_revo'
directories = [x[0] for x in os.walk(root_directory) if '.git' not in x[0]]
docs = []


for directory in directories[1:]:
    md_files = [f"{directory}/{md_file}" for md_file in os.listdir(directory)]
    for md_file in md_files:
        read_md = read_MD(md_file)
        docs += [{"text": doc, "title": read_md["title"], "file": md_file, "_index": INDEX_NAME} for _i, doc in enumerate(read_md["docs"])]

print(len(docs))
print(docs[0])

6098
{'text': '1793年9月5日将近正午时，巴黎市政厅前的广场上聚起了一大群人，他们准备去国民公会前游行。就在此时，巴黎和整个欧洲北部地区上空都陷入了黑暗之中。由于日食，从巴黎看去，太阳的四分之三区域在11点47分时就开始慢慢丧失光芒。而再往北走，就是日全食了。法国首都的人们就是在这样古怪的昏暗天色里准备他们的暴动的。这个打乱他们日程的变故让众人一片哗然，大家纷纷拿这事说笑。', 'title': '组织恐怖', 'file': '../french_revo/12_who_ruled/12_who_ruled_chapter03.md', '_index': 'es_french_revo_idx'}


In [87]:
# Add the documents to the index directly
for i in range(0, len(docs), ES_CHUNK_SIZE):
    ingest_data.apply_async(
        kwargs={
            "docs": docs[i: min(i + ES_CHUNK_SIZE, len(docs))]
        }
    )

### Aside: Pretty printing Elasticsearch responses

Your API calls will return hard-to-read nested JSON. We'll create a little function called pretty_response to return nice, human-readable outputs from our examples.

In [9]:
def pretty_response(response):
    if len(response["hits"]["hits"]) == 0:
        print("Your search returned no results.")
    else:
        for hit in response["hits"]["hits"]:
            id = hit["_id"]
            score = hit["_score"]
            doc_title = hit["_source"]["title"]
            passage_text = ""

            for passage in hit["inner_hits"]["passages"]["hits"]["hits"]:
                passage_text += passage["fields"]["passages"][0]["text"][0] + "\n\n"
            text = hit["_source"]["text"]

            pretty_output = f"ID: {id}\nDoc Title: {doc_title}\nText:\n{text}\nScore: {score}"
            print(pretty_output)
            print("---")

1. 下载ES数据上传到云
1. 写一个 OpenAI chatbot
1. 部署到云上
1. 写个视频

In [6]:
response = client.search(
    index=INDEX_NAME,
    knn={
        "inner_hits": {"size": 1, "_source": False, "fields": ["passages.text"]},
        "field": "passages.vector.predicted_value",
        "k": 20,
        "num_candidates": 100,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": MODEL_ID_ES,
                "model_text": "资产阶级由什么样的人群构成？",
            }
        },
    },
)

[hit["_source"]["text"] for hit in response["hits"]["hits"]]



['资产阶级的组成成分远不是那么整齐划一。严格地说，自认为是资产者的那些人无非是少数发了财的平民，他们可以不用做工而靠自己的财产过着贵族式的生活，这些财产或者是土地，或者是年金，少数是动产。他们勉强能接受与以下两个集团的成员为伍，但要求这些人必须同样有钱，而且毫无例外地不从事体力劳动，只担任官职或领导职务。',
 '资产阶级是第三等级中占优势的阶级：它领导了大革命，并从中得到好处。它凭借财富和文化占据了社会头等位置，然而这种地位与特权等级的正式地位相抵触。根据社会地位和在经济生活中的作用，可以把资产阶级划分为不同类型：严格意义上的“资产阶级”，是以本金利润或地产收入为生的食利者；以法律界人士、官吏等组成的自由职业型资产阶级，是一个多样化与复杂的资产阶级类型；手工业者和店主是与传统的生产和交换体系紧密联系的中、小资产阶级；工商业大资产阶级直接以利润为生，该类型十分活跃，是整个资产阶级的推动力量。',
 '另一个集团包括金融家和实业家，他们的名望较低，却往往更加有钱。包税人、王家采办和王家供奉等为国家效力的金融家地位较高，有的甚至晋升贵族，信奉新教的外国人内克尔被提拔当了大臣。造船主、批发商、制造商的人数虽多，但势力不大。在某些城市中，他们以商会或商事法院为依托，制造商也加入本行业的行会。旧制度下的这个资产阶级就是我们所说的大资产阶级。他们同贵族一样，只占居民中的极少数。行会组织在排外性方面与贵族完全相同。库尔诺写道，一个集团对另一个集团的“种种蔑视”阻碍了团结；而每个资产者历来都梦想跻于上等阶级的行列。在法国，他们的地位如今已上升到这样的程度，以致政府开始把他们同仍然富有的贵族一起统称为“缙绅”，这个由金钱创造的和打破了门第隔阂的社会范畴已经构成现代的资产阶级。',
 '可见，资产阶级只是丧失了自己的部分代表，包括一些杰出的代表在内，阶级的内在结构也有所变化。不从事生产而依靠官职或祖业为生的那部分资产者，如年金收入者、旧军官、国家官吏、法律界人士等，虽然他们的地位并未真正动摇，但已不如旧制度下那么富裕。关于这一点，我们今天可以看得更加清楚。他们从思想上为大革命作了准备，提出了革命的原则，充当了革命的领导，最后却发现自己的领导地位逐渐被操纵经济活动的新富人所取代，因为这些新富人正购置地产，在缙绅中扩大影响和提高声望。从另方面看，资产阶级的地位也随着新富人加入自己的行列而