In [27]:
import os
import re
from elasticsearch import Elasticsearch
from getpass import getpass

os.environ["RABBITMQ_HOST"] = "localhost"

from celery_tasks import ingest_data

# Upper bound on passage length; passed to the ingest chunking script as
# `model_limit` and compared against String.length() there.
CHUNK_SIZE = 400
# NOTE(review): not referenced by any cell visible in this notebook section.
ES_CHUNK_SIZE = 1000
INDEX_NAME = "es_french_revo_idx"
# Hugging Face hub id of the embedding model uploaded with eland.
MODEL_ID = "BAAI/bge-large-zh-v1.5"
# Id under which the model is registered in Elasticsearch. The eland import log
# reports the model as 'baai__bge-large-zh-v1.5' (lower-cased), so the id used by
# the inference processor and the kNN query must be lower-case as well.
MODEL_ID_ES = "baai__bge-large-zh-v1.5"
# Must match the `dims` / `similarity` of the dense_vector mapping.
MODEL_DIM = 1024
MODEL_SIMILARITY = "cosine"

ES_HOST = "https://localhost:9200/"
# Never hardcode credentials in a notebook: read the password from the ES_PASS
# environment variable and fall back to an interactive prompt.
# NOTE(review): a password was previously committed on this line; rotate it.
ES_PASS = os.environ.get("ES_PASS") or getpass("ElasticSearch Password: ")

In [2]:
# Sanity check: show the name of CUDA device 0.
# NOTE(review): presumably raises on a machine without a CUDA device — confirm
# whether this cell is required for the rest of the notebook.
import torch
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070 with Max-Q Design'

In [3]:
# Create the client instance and print the cluster info as a connectivity check.
client = Elasticsearch(
    # For local development
    hosts=[ES_HOST],
    # Authenticates as the built-in 'elastic' user with ES_PASS.
    basic_auth=('elastic', ES_PASS), 
    # TLS certificate verification is switched off here; acceptable only for a
    # local development cluster, do not reuse this setting against a real one.
    verify_certs=False
)
print(client.info())

{'name': 'es01', 'cluster_name': 'docker-cluster', 'cluster_uuid': '2d2hAdh2RK2B7QCE92Kbdw', 'version': {'number': '8.13.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '09df99393193b2c53d92899662a8b8b3c55b45cd', 'build_date': '2024-03-22T03:35:46.757803203Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


  _transport = transport_class(


### Load model from Hugging Face

The first thing you will need is a model to create the text embeddings out of the chunks. You can use whichever model you like; this example runs end to end on the BAAI/bge-large-zh-v1.5 model (`MODEL_ID`). With an Elastic Cloud cluster created or another Elasticsearch cluster ready, we can upload the text embedding model using the eland library.

In [4]:
# Upload MODEL_ID from the Hugging Face hub to the cluster at ES_HOST as a
# text_embedding model. NOTE(review): the password is passed on the command
# line via -p and --insecure is set; suitable for a local cluster only.
!eland_import_hub_model \
    -u elastic -p $ES_PASS \
    --url $ES_HOST \
    --hub-model-id $MODEL_ID \
    --task-type text_embedding \
    --insecure \
    --clear-previous \
    --start

2024-05-14 08:46:21,579 INFO : Establishing connection to Elasticsearch
  _transport = transport_class(
2024-05-14 08:46:21,607 INFO : Connected to cluster named 'docker-cluster' (version: 8.13.0)
2024-05-14 08:46:21,607 INFO : Loading HuggingFace transformer tokenizer and model 'BAAI/bge-large-zh-v1.5'
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
STAGE:2024-05-14 08:46:31 95690:95690 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-05-14 08:46:33 95690:95690 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-05-14 08:46:33 95690:95690 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
2024-05-14 08:46:39,255 INFO : Stopping deployment for model with id 'baai__bge-large-zh-v1.5'
2024-05-14 08:46:39,488 INFO : Deleting model with id 'baai__bge-large-zh-v1.5'
2024-05-14 08:46:47,744 INFO : Creating model with id 'baai__bge-large-zh-v1.5'

In [5]:
# Setup the ingest pipeline "chunk_text_to_passages". It has two processors:
#   1. a Painless script that splits ctx['text'] into segments and greedily packs
#      consecutive segments into ctx['passages'] entries while the combined
#      length stays below params.model_limit (CHUNK_SIZE). A single segment that
#      is already longer than the limit is still emitted as its own passage.
#   2. a foreach over `passages` that runs the embedding model on each passage's
#      text and stores the result in `vector.predicted_value` of that passage.
# NOTE(review): the split regex only matches a space that follows 。/？/！/…;
# the sample paragraphs printed further down have no space after 。, so they
# would not be split at all — confirm this is intended.
client.ingest.put_pipeline(
    id="chunk_text_to_passages",
    processors=[
        {
            "script": {
                "description": "Chunk text into passages by splitting on a space that follows 。, ？, ！ or …",
                "lang": "painless",
                "source": """
          String[] envSplit = /((?<=。) |(?<=？) |(?<=！) |(?<=…+) )/.split(ctx['text']);
          ctx['passages'] = new ArrayList();
          int i = 0;
          boolean remaining = true;
          if (envSplit.length == 0) {
            return
          } else if (envSplit.length == 1) {
            Map passage = ['text': envSplit[0]];ctx['passages'].add(passage)
          } else {
            while (remaining) {
              Map passage = ['text': envSplit[i++]];
              while (i < envSplit.length && passage.text.length() + envSplit[i].length() < params.model_limit) {passage.text = passage.text + ' ' + envSplit[i++]}
              if (i == envSplit.length) {remaining = false}
              ctx['passages'].add(passage)
            }
          }
          """,
                "params": {"model_limit": CHUNK_SIZE},
            }
        },
        {
            "foreach": {
                "field": "passages",
                "processor": {
                    "inference": {
                        "model_id": MODEL_ID_ES,
                        "input_output": [
                            {
                                "input_field": "_ingest._value.text",
                                "output_field": "_ingest._value.vector.predicted_value"
                            }
                        ],
                        # Record inference failures on the document; the
                        # message and pipeline name refer to this pipeline so
                        # the stored error points at the right place.
                        "on_failure": [
                            {
                                "append": {
                                    "field": "_source._ingest.inference_errors",
                                    "value": [
                                        {
                                            "message": "Processor 'inference' in pipeline 'chunk_text_to_passages' failed with message '{{ _ingest.on_failure_message }}'",
                                            "pipeline": "chunk_text_to_passages",
                                            "timestamp": "{{{ _ingest.timestamp }}}",
                                        }
                                    ],
                                }
                            }
                        ],
                    }
                },
            }
        },
    ],
)



ObjectApiResponse({'acknowledged': True})

### Setup Index

The next step is to prepare the mappings to handle the array of passages and vector objects that will be created during the ingest pipeline. For this particular text embedding model the dimensions are 1024 (`MODEL_DIM`) and cosine similarity (`MODEL_SIMILARITY`) is used for nearest-neighbor calculations:

In [9]:
# Drop any existing index first so this cell can be re-run; this discards every
# document already stored in INDEX_NAME.
client.indices.delete(index=INDEX_NAME, ignore_unavailable=True)

# Setup the index
client.indices.create(
    index=INDEX_NAME,
    # Every document indexed here is routed through the chunking/embedding pipeline.
    settings={"index": {"default_pipeline": "chunk_text_to_passages"}},
    mappings={
        "dynamic": "true",
        "properties": {
            # One nested object per passage produced by the pipeline.
            "passages": {
                "type": "nested",
                "properties": {
                    "vector": {
                        "properties": {
                            # Embedding written by the inference processor;
                            # dims/similarity come from the config constants.
                            "predicted_value": {
                                "type": "dense_vector",
                                "index": True,
                                "dims": MODEL_DIM,
                                "similarity": MODEL_SIMILARITY,
                            }
                        }
                    }
                },
            }
        },
    },
)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'es_french_revo_idx'})

## Add some Documents through Celery

Now we can add documents with large amounts of text in the `text` field and have them automatically chunked, with each chunk embedded into a vector by the model:

In [32]:
# Sample markdown file used to preview the parser; the path is relative to the
# notebook's working directory.
file = '../french_revo/12_who_ruled/12_who_ruled_note_bibliographical.md'

# Read MD file
def read_MD(md_file):
    """Read a markdown file and return its non-empty paragraphs as plain text.

    Args:
        md_file: Path of the markdown file. The file is decoded as UTF-8.

    Returns:
        A list of strings, one per line/paragraph, with markdown markup
        removed and empty entries dropped.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; the context manager closes the handle even if read() fails.
    with open(md_file, 'r', encoding='utf-8') as f:
        docs = f.read()
    # Remove markdown markup: heading markers, emphasis (* and _), block-quote
    # markers, footnote references such as [^12], and ": " sequences.
    docs = re.sub(r'#+ |\*+|_+|\> |\[\^[0-9]+\]|: ', '', docs)
    # Remove ordered ("1. ") and unordered ("- ") list prefixes at line starts.
    docs = re.sub(r'\n *[0-9]+\. +|\n- ', '\n', docs)
    # Split into paragraphs/lines; a horizontal rule between blank lines is
    # consumed as a separator.
    docs = re.split(r'\n\n---\n\n|\n\n|\n', docs)
    # Drop empty strings
    return [doc for doc in docs if len(doc) > 0]
    

# Preview the parsed paragraphs of the sample file.
read_MD(file)

['参考书目',
 '由于1939-1940年间，对外国学者而言，查阅法国本土的档案纪录极为困难，本书完全依靠已出版的作品写作。幸运的是，关于法国大革命，已经出版的作品足够丰富，可以在法国以外的大型研究图书馆里借阅到。我把这些作品分为三类：',
 '革命时期的出版物，如期刊、书籍、小册子、国民公会中讲话的复印',
 '革命后在国家档案馆和其他档案馆里的档案汇编',
 '19和20世纪学者的传记、论著、专题论文，经常也包括1790年代的材料。',
 '此外，还有对共和二年（大约可等同于1793年7月到1794年7月）中救国委员会举措的评论，这其中包括很多在写作本书时还没有发表的评论。对1940年以前参考书目的详细内容，可以参考我的"Bibliographical ArticleFifty Years of the Committee of Public Safety", Journal of Modern History 13 (1941), pp. 375-397.',
 '如果读者想要阅读更晚近的，概述法国大革命的英文著作，可以参考D. M. G. Sutherland, Oxford 1986；John M. Roberts, Oxford 1978；N. Hampson, London 1975；还有由Albert Soboul的法文著作译为英文的London 1974，由Frangois Furet和Daniel Richet著作译为英文的, London and New York 1970。直接讨论救国委员会的著作较少，只有一些法文口袋书，"Que sais-je?", by Marc Bouloiseau, Le comite de salut public, 1793-1795, Paris 1968. 也可参考Bouloiseau在1972年的著作，译名为 The Jacobin Republic, 1792-1794, Cambridge and New York 1983。还有M. J. Sydenham的 The First French Republic, London 1974.',
 "此前提到的第一类内容，卷帙浩繁，浩如烟海，无法一一列举。大革命本身就是政治讨论的一次大爆发。无数出版物，如Hebert的 Pere Duchesne，还有

### Aside: Pretty printing Elasticsearch responses

Your API calls will return hard-to-read nested JSON. We'll create a little function called pretty_response to return nice, human-readable outputs from our examples.

In [10]:
def pretty_response(response):
    """Print a human-readable summary of a kNN search response.

    For every hit, prints its id, the `name` field of its source, the text of
    each passage found under the `passages` inner hits, and the score,
    followed by a `---` separator line. Prints a notice instead when the
    response contains no hits.
    """
    results = response["hits"]["hits"]
    if len(results) == 0:
        print("Your search returned no results.")
        return

    for result in results:
        doc_id = result["_id"]
        doc_score = result["_score"]
        title = result["_source"]["name"]
        inner = result["inner_hits"]["passages"]["hits"]["hits"]
        # Each passage text is followed by a blank line.
        passages = "".join(
            entry["fields"]["passages"][0]["text"][0] + "\n\n" for entry in inner
        )
        print(
            f"\nID: {doc_id}\nDoc Title: {title}\nPassage Text:\n{passages}\nScore: {doc_score}\n"
        )
        print("---")

### Making queries

To search the data and return the chunk that best matches the query, use `inner_hits` with the `knn` clause; the hits in the response then contain just the best-matching chunk of each document.

Below you will see the response which returns the best document and the most relevant passage.

In [11]:
# Run a kNN search over the passage vectors; the query text is embedded on the
# Elasticsearch side by MODEL_ID_ES via query_vector_builder.
response = client.search(
    index=INDEX_NAME,
    knn={
        # Return only the single best-matching passage per document, and only
        # its text field rather than the whole nested source.
        "inner_hits": {"size": 1, "_source": False, "fields": ["passages.text"]},
        "field": "passages.vector.predicted_value",
        # k results requested out of num_candidates considered; see the
        # Elasticsearch kNN search documentation for the exact semantics.
        "k": 20,
        "num_candidates": 100,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": MODEL_ID_ES,
                "model_text": "罗伯斯庇尔",
            }
        },
    },
)

# NOTE(review): pretty_response reads hit["_source"]["name"]; confirm the
# indexed documents carry a `name` field.
pretty_response(response)


ID: 0
Doc Title: Work From Home Policy
Passage Text:
Effective: March 2020
Purpose

The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.
Scope

This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities.


Score: 0.854961

---

ID: 7
Doc Title: Intellectual Property Policy
Passage Text:
This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.

Scope
This policy applies to all employees, including full-time, part-time, temporary, and contract employees.

Definitions
a.


Score: 0.7664343

---

ID: 4
Doc Title: Company Vacation Policy
Passage Text:
Purpose

The purpose of this vacation policy is to outline the guidelines and procedures for requesting and takin

