In [None]:
import os
import re
from elasticsearch import Elasticsearch

os.environ["RABBITMQ_HOST"] = "localhost"

from celery_tasks import ingest_data

from getpass import getpass

# --- Ingestion sizing -------------------------------------------------------
# NOTE(review): CHUNK_SIZE is not referenced in the visible cells; presumably the
# passage chunk length used by the ingestion pipeline — TODO confirm.
CHUNK_SIZE = 400
# Number of documents sent to the ingestion task per call.
ES_CHUNK_SIZE = 50

# --- Index / embedding model ------------------------------------------------
INDEX_NAME = "es_french_revo_idx"
# Hugging Face hub id of the embedding model.
MODEL_ID = "BAAI/bge-large-zh-v1.5"
# Model id used on the Elasticsearch side: the hub id lower-cased with "/" replaced
# by "__". Evaluates to the same value as the previous literal
# ("baai__bge-large-zh-v1.5"); presumably the id eland assigns on import — TODO confirm.
MODEL_ID_ES = MODEL_ID.lower().replace("/", "__")
# Dimensionality and similarity metric declared for the dense_vector field.
MODEL_DIM = 1024
MODEL_SIMILARITY = "cosine"

# --- Connection ---------------------------------------------------------------
ES_HOST = "https://localhost:9200/"
# Never commit credentials: read the password from the ES_PASS environment variable,
# falling back to an interactive prompt. The password that used to be hard-coded
# here should be considered leaked and rotated.
ES_PASS = os.environ.get("ES_PASS") or getpass("ElasticSearch Password: ")

In [None]:
# Import the Hugging Face model MODEL_ID into the Elasticsearch cluster at ES_HOST
# with the eland_import_hub_model CLI, as a text_embedding model. IPython substitutes
# the $ES_PASS / $ES_HOST / $MODEL_ID Python variables before the shell runs the command.
# NOTE(review): --insecure presumably disables TLS verification, and -p puts the
# password on the command line — fine for local development only; confirm before reuse.
!eland_import_hub_model \
    -u elastic -p $ES_PASS \
    --url $ES_HOST \
    --hub-model-id $MODEL_ID \
    --task-type text_embedding \
    --insecure \
    --clear-previous \
    --start

In [None]:
# Connection settings for the local development cluster: authenticate as the
# "elastic" user and skip certificate verification.
es_connection = dict(
    hosts=[ES_HOST],
    basic_auth=('elastic', ES_PASS),
    verify_certs=False,
)
client = Elasticsearch(**es_connection)

# Sanity check: show the cluster info to confirm the connection works.
print(client.info())

In [None]:
# Drop any existing index so this cell can be re-run from a clean slate.
client.indices.delete(index=INDEX_NAME, ignore_unavailable=True)

# Each passage stores its embedding under passages.vector.predicted_value,
# indexed as a dense vector so it can be searched with kNN.
passage_vector_mapping = {
    "type": "dense_vector",
    "index": True,
    "dims": MODEL_DIM,
    "similarity": MODEL_SIMILARITY,
}

# Passages are a nested field; every other field is mapped dynamically.
index_mappings = {
    "dynamic": "true",
    "properties": {
        "passages": {
            "type": "nested",
            "properties": {
                "vector": {
                    "properties": {"predicted_value": passage_vector_mapping},
                },
            },
        },
    },
}

client.indices.create(index=INDEX_NAME, mappings=index_mappings)

## Add some Documents through Celery

Now we can add documents with large amounts of text in the `text` field and have them chunked automatically, with each chunk's text embedded into a vector by the model:

In [None]:
def read_MD(md_file):
    """Read a Markdown file and split it into a title plus plain-text paragraphs.

    Markdown markup is stripped, the text is split into paragraphs/lines, and
    empty pieces are dropped. The first remaining piece is taken as the title.

    Args:
        md_file: Path of the Markdown file to read (decoded as UTF-8).

    Returns:
        A dict ``{"title": str, "docs": list[str]}``. For a file with no
        non-empty content the title is ``""`` and ``docs`` is empty.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Context manager closes the handle; explicit UTF-8 avoids depending on the
    # platform's default encoding for non-ASCII text.
    with open(md_file, 'r', encoding='utf-8') as f:
        docs = f.read()
    # Strip Markdown markup: heading markers, emphasis (* and _), block-quote
    # markers, footnote references such as [^1], and ": " sequences.
    docs = re.sub(r'#+ |\*+|_+|\> |\[\^[0-9]+\]|: ', '', docs)
    # Strip ordered ("1. ") and unordered ("- ") list markers at line starts.
    docs = re.sub(r'\n *[0-9]+\. +|\n- ', '\n', docs)
    # Split into paragraphs/lines; a "---" rule surrounded by blank lines is a separator too.
    docs = re.split(r'\n\n---\n\n|\n\n|\n', docs)
    # Drop empty strings.
    docs = [doc for doc in docs if doc]
    if not docs:
        # Empty (or markup-only) file: nothing to index.
        return {"title": "", "docs": []}
    return {
        "title": docs[0],
        "docs": docs[1:]
    }

In [None]:
root_directory = '../french_revo'
# Every directory under the corpus root, skipping any path that contains ".git".
directories = [x[0] for x in os.walk(root_directory) if '.git' not in x[0]]
docs = []

# os.walk yields the root first; only its sub-directories are ingested.
for directory in directories[1:]:
    for entry in os.listdir(directory):
        md_file = f"{directory}/{entry}"
        # os.listdir also returns sub-directory names. Those are visited on their
        # own via os.walk, and passing one to read_MD would fail on open().
        if not os.path.isfile(md_file):
            continue
        read_md = read_MD(md_file)
        # One document per paragraph, tagged with its source file and target index.
        docs += [
            {"text": doc, "title": read_md["title"], "file": md_file, "_index": INDEX_NAME}
            for doc in read_md["docs"]
        ]

print(len(docs))
# Show a sample document; guard against an empty corpus.
if docs:
    print(docs[0])

In [None]:
# Hand the documents to the ingest_data task in batches of ES_CHUNK_SIZE,
# one asynchronous call per batch. Slicing clamps at the end of the list,
# so the final batch may simply be shorter.
for batch_start in range(0, len(docs), ES_CHUNK_SIZE):
    batch = docs[batch_start: batch_start + ES_CHUNK_SIZE]
    ingest_data.apply_async(kwargs={"docs": batch})

### Aside: Pretty printing Elasticsearch responses

Elasticsearch API calls return hard-to-read nested JSON. We'll create a small function called `pretty_response` that prints nice, human-readable output for our examples.

In [None]:
def pretty_response(response):
    """Print the hits of an Elasticsearch search response in a readable form.

    For every hit, prints its id, title, text and score followed by a ``---``
    separator line. Prints a short notice when there are no hits.

    Args:
        response: A search response (or any mapping) shaped like
            ``{"hits": {"hits": [{"_id", "_score", "_source": {"title", "text"}}, ...]}}``.

    Returns:
        None. Output goes to stdout.

    Raises:
        KeyError: If a hit lacks ``_id``, ``_score``, or the ``title``/``text``
            source fields.
    """
    hits = response["hits"]["hits"]
    if not hits:
        print("Your search returned no results.")
        return

    for hit in hits:
        source = hit["_source"]
        # Only top-level hit fields are printed, so "inner_hits" is not required
        # (the previous version assembled the passage text but never displayed it).
        print(
            f"ID: {hit['_id']}\nDoc Title: {source['title']}\n"
            f"Text:\n{source['text']}\nScore: {hit['_score']}"
        )
        print("---")

In [None]:
# The question is embedded server-side by the deployed model (MODEL_ID_ES) and
# matched against the nested passage vectors with kNN.
query_text = "资产阶级由什么样的人群构成？"

knn_query = {
    # For each hit, return only the text of the single best-matching passage.
    "inner_hits": {"size": 1, "_source": False, "fields": ["passages.text"]},
    "field": "passages.vector.predicted_value",
    "k": 20,
    "num_candidates": 100,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": MODEL_ID_ES,
            "model_text": query_text,
        }
    },
}

response = client.search(index=INDEX_NAME, knn=knn_query)

# Display the source text of every returned hit.
matched_hits = response["hits"]["hits"]
[hit["_source"]["text"] for hit in matched_hits]