# RecDP LLM - RAG Indexer Pipeline (ElasticSearch)

This notebook will show you how to construct a RAG indexer pipeline with RecDP-LLM, it use Elasticsearch as backend document store

# Get started

## 1. Install pyrecdp and dependencies

In [2]:
! DEBIAN_FRONTEND=noninteractive apt-get install -qq -y openjdk-8-jre
! pip install -q pyrecdp --pre
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. Setup and run RAG indexer pipeline

### 2.1 Loader documents

In [1]:
urls = ['https://app.cnvrg.io/docs/', 
        'https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html',
        'https://app.cnvrg.io/docs/cli_v2/cnvrgv2_cli.html',
        'https://app.cnvrg.io/docs/collections/tutorials.html']

from pyrecdp.primitives.operations import RecursiveUrlLoader
url_loader = RecursiveUrlLoader(urls, max_depth=2)

ds = url_loader.process_rayds()
display(ds.to_pandas())

[32m2023-12-19 16:28:23.864[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m52[0m - [1mcheck_availability_and_install emoji==2.2.0[0m


[32m2023-12-19 16:28:23.958[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['bs4', 'langchain'][0m


2023-12-19 16:28:50,843	INFO worker.py:1642 -- Started a local Ray instance.


Unnamed: 0,text,metadata
0,\n\n\n\n\nWelcome to cnvrg.io | cnvrg.io docs\...,{'description': 'Documentation website for cnv...
1,\n\n\n\n\nProjects | cnvrg.io docs\n\n\n\n\n\n...,{'description': 'Documentation website for cnv...
2,\n\n\n\n\nExperiments | cnvrg.io docs\n\n\n\n\...,{'description': 'Documentation website for cnv...
3,\n\n\n\n\nServing | cnvrg.io docs\n\n\n\n\n\n\...,{'description': 'Documentation website for cnv...
4,\n\n\n\n\nContainers | cnvrg.io docs\n\n\n\n\n...,{'description': 'Documentation website for cnv...
5,"\n\n\n\n\nOrganization, Account, and Team Sett...",{'description': 'Documentation website for cnv...
6,\n\n\n\n\nWorkspaces | cnvrg.io docs\n\n\n\n\n...,{'description': 'Documentation website for cnv...
7,\n\n\n\n\nCompute | cnvrg.io docs\n\n\n\n\n\n\...,{'description': 'Documentation website for cnv...
8,\n\n\n\n\nTutorials and Examples | cnvrg.io do...,{'description': 'Documentation website for cnv...
9,\n\n\n\n\nRelease Notes | cnvrg.io docs\n\n\n\...,{'description': 'Documentation website for cnv...


[2m[33m(raylet)[0m [2023-12-19 16:28:59,723 E 184744 184762] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-28-47_846155_183468 is over 95% full, available space: 482947072; capacity: 422146228224. Object creation will fail if spilling is required.


### 2.2 Preprocess documents

#### 2.2.1 Remove header separator and Extra white space

In [3]:
from pyrecdp.primitives.operations import RAGTextFix
text_fixer = RAGTextFix(str_to_replace={'\n###': '', '\n##': '', '\n#': ''}, remove_extra_whitespace=True)
ds = text_fixer.process_rayds(ds)
display(ds.to_pandas())


[32m2023-12-19 15:43:27.897[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['ftfy', 'selectolax'][0m
[32m2023-12-19 15:43:27.954[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m52[0m - [1mcheck_availability_and_install nltk[0m
[32m2023-12-19 15:43:28.263[0m | [1mINFO    [0m | [36mpyrecdp.core.model_utils[0m:[36mprepare_nltk_model[0m:[36m164[0m - [1mLoading nltk punkt split model...[0m


2023-12-19 15:43:28,273	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)]
2023-12-19 15:43:28,274	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-19 15:43:28,276	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

[2m[36m(Map(<lambda>) pid=109694)[0m 2023-12-19 15:43:30.292 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0


Unnamed: 0,text,metadata
0,Flows | cnvrg.io docs cnvrg.io docs Tutorials...,{'description': 'Documentation website for cnv...
1,"Organization, Account, and Team Settings | cn...",{'description': 'Documentation website for cnv...
2,Experiments | cnvrg.io docs cnvrg.io docs Tut...,{'description': 'Documentation website for cnv...
3,Tutorials and Examples | cnvrg.io docs cnvrg....,{'description': 'Documentation website for cnv...
4,Datasets | cnvrg.io docs cnvrg.io docs Tutori...,{'description': 'Documentation website for cnv...
5,Distributed Jobs | cnvrg.io docs cnvrg.io doc...,{'description': 'Documentation website for cnv...
6,cnvrgv2 CLI | cnvrg.io docs cnvrg.io docs Tut...,{'description': 'Documentation website for cnv...
7,Videos | cnvrg.io docs cnvrg.io docs Tutorial...,{'description': 'Documentation website for cnv...
8,Workspaces | cnvrg.io docs cnvrg.io docs Tuto...,{'description': 'Documentation website for cnv...
9,Projects | cnvrg.io docs cnvrg.io docs Tutori...,{'description': 'Documentation website for cnv...


#### 2.2.2 Split document

In [4]:
from pyrecdp.primitives.operations import CustomerDocumentSplit
split_doc_op = CustomerDocumentSplit(func=lambda text: text.split('# ')[1:])
ds = split_doc_op.process_rayds(ds)
display(ds.to_pandas())

2023-12-19 15:43:33,682	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)]
2023-12-19 15:43:33,683	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-19 15:43:33,684	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,text,metadata
0,Welcome to cnvrg.io cnvrg.io is a machine lear...,{'description': 'Documentation website for cnv...
1,"Resource Management With our Dashboard, get a ...",{'description': 'Documentation website for cnv...
2,"Apps cnvrg is a full-stack platform, designed ...",{'description': 'Documentation website for cnv...
3,R Shiny Shiny is an R package that simplifies ...,{'description': 'Documentation website for cnv...
4,Use R Shiny Complete the following steps to se...,{'description': 'Documentation website for cnv...
...,...,...
828,Tutorials and Examples To help you get started...,{'description': 'Documentation website for cnv...
829,Example Projects Build and Deploy an IMDB NLP ...,{'description': 'Documentation website for cnv...
830,"Workspaces, Experiments and IDEs Run an Experi...",{'description': 'Documentation website for cnv...
831,Flows and Serving Processing your Dataset with...,{'description': 'Documentation website for cnv...


[2m[33m(raylet)[0m [2023-12-19 15:43:34,260 E 109492 109513] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_15-37-41_942580_108415 is over 95% full, available space: 511860736; capacity: 422146228224. Object creation will fail if spilling is required.


#### 2.2.3 Add customer filter

In [5]:
def custom_filter(text):
    from nltk.tokenize import word_tokenize
    ret_txt = None
    if len(word_tokenize(text)) >10:
        if text.split(' ')[0].lower()!='version':
            ret_txt = text
    return ret_txt != None
    
from pyrecdp.primitives.operations import TextCustomerFilter
custom_filter_op = TextCustomerFilter(custom_filter)

ds = custom_filter_op.process_rayds(ds)
display(ds.to_pandas())


2023-12-19 15:43:36,846	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)]
2023-12-19 15:43:36,847	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-19 15:43:36,848	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,text,metadata
0,Welcome to cnvrg.io cnvrg.io is a machine lear...,{'description': 'Documentation website for cnv...
1,"Resource Management With our Dashboard, get a ...",{'description': 'Documentation website for cnv...
2,"Apps cnvrg is a full-stack platform, designed ...",{'description': 'Documentation website for cnv...
3,R Shiny Shiny is an R package that simplifies ...,{'description': 'Documentation website for cnv...
4,Use R Shiny Complete the following steps to se...,{'description': 'Documentation website for cnv...
...,...,...
570,Delete a registry To delete a registry from th...,{'description': 'Documentation website for cnv...
571,"Create an image To create an image, use the fo...",{'description': 'Documentation website for cnv...
572,Get an image To retrieve information about an ...,{'description': 'Documentation website for cnv...
573,when reffering to an image built from dockerfi...,{'description': 'Documentation website for cnv...


#### 2.2.4 chunk document

In [6]:
def chunk_doc(text,max_num_of_words):
    from nltk.tokenize import word_tokenize,sent_tokenize
    text= text.strip()
    if len(word_tokenize(text)) <= max_num_of_words:
        return [text]
    else:
        chunks = []
        # split by sentence
        sentences = sent_tokenize(text)
        # print('number of sentences: ', len(sentences))
        words_count = 0
        temp_chunk = ""
        for s in sentences:
            temp_chunk+=(s+" ")
            words_count += len(word_tokenize(s))
            if len(word_tokenize(temp_chunk))> max_num_of_words:
                chunks.append(temp_chunk)
                words_count = 0
                temp_chunk = ""
                
        return chunks
    
from pyrecdp.primitives.operations import CustomerDocumentSplit
chunk_doc_op = CustomerDocumentSplit(chunk_doc, max_num_of_words=50)

ds = chunk_doc_op.process_rayds(ds)
display(ds.to_pandas())

2023-12-19 15:43:40,101	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>)]
2023-12-19 15:43:40,102	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-19 15:43:40,103	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,text,metadata
0,Welcome to cnvrg.io cnvrg.io is a machine lear...,{'description': 'Documentation website for cnv...
1,Dataset Use Datasets to manage data with versi...,{'description': 'Documentation website for cnv...
2,Use Papers to consolidate comparison across ex...,{'description': 'Documentation website for cnv...
3,"Resource Management With our Dashboard, get a ...",{'description': 'Documentation website for cnv...
4,"Apps cnvrg is a full-stack platform, designed ...",{'description': 'Documentation website for cnv...
...,...,...
900,Delete a registry To delete a registry from th...,{'description': 'Documentation website for cnv...
901,"Create an image To create an image, use the fo...",{'description': 'Documentation website for cnv...
902,Get an image To retrieve information about an ...,{'description': 'Documentation website for cnv...
903,when reffering to an image built from dockerfi...,{'description': 'Documentation website for cnv...


[2m[33m(raylet)[0m [2023-12-19 15:43:44,274 E 109492 109513] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_15-37-41_942580_108415 is over 95% full, available space: 511811584; capacity: 422146228224. Object creation will fail if spilling is required.


### 2.3 Store documents 



#### 2.3.1 Setup ElasticSearch
You'll need to install Docker Engine on your development system. Note that while Docker Engine is free to use, Docker Desktop may require you to purchase a license. See the [Docker Engine Server installation instructions](https://docs.docker.com/engine/install/#server) for details.

In [6]:
!docker run -p 127.0.0.1:9200:9200 -p 127.0.0.1:9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.17.15

docker: Error response from daemon: driver failed programming external connectivity on endpoint admiring_meitner (af5f1498f190e509046ff0a13aa9a6254e94bbb3e8b6cf7e2a9f9df5d6ae882f): Bind for 127.0.0.1:9300 failed: port is already allocated.
[31mERRO[0m[0000] error waiting for container:                 


[2m[33m(raylet)[0m [2023-12-19 14:39:30,703 E 4182447 4182465] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_14-38-38_780920_4181242 is over 95% full, available space: 563916800; capacity: 422146228224. Object creation will fail if spilling is required.


In [1]:
! curl -X GET "localhost:9200"

{
  "name" : "ffeee69fabad",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "QJVbm1Z1RAq7XNnxFdVMeA",
  "version" : {
    "number" : "7.17.15",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "0b8ecfb4378335f4689c4223d1f1115f16bef3ba",
    "build_date" : "2023-11-10T22:03:46.987399016Z",
    "build_snapshot" : false,
    "lucene_version" : "8.11.1",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


#### 2.3.2 Ingest documents into document store

In [7]:
import ray
from typing import Dict
import numpy as np

def add_dog_years(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
                    batch["age_in_dog_years"] = 7 * batch["age"]
                    return batch

ds = (
    ray.data.from_items([
        {"name": "Luna", "age": 4},
        {"name": "Rory", "age": 14},
        {"name": "Scout", "age": 9},
    ])
    .map_batches(add_dog_years)
)
ds.show()


2023-12-19 15:43:49,797	INFO dataset.py:2380 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2023-12-19 15:43:49,801	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(add_dog_years)] -> LimitOperator[limit=20]
2023-12-19 15:43:49,803	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-19 15:43:49,804	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

{'name': 'Luna', 'age': 4, 'age_in_dog_years': 28}
{'name': 'Rory', 'age': 14, 'age_in_dog_years': 98}
{'name': 'Scout', 'age': 9, 'age_in_dog_years': 63}


[2m[33m(raylet)[0m [2023-12-19 15:43:54,288 E 109492 109513] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_15-37-41_942580_108415 is over 95% full, available space: 519995392; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-19 15:44:04,302 E 109492 109513] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_15-37-41_942580_108415 is over 95% full, available space: 519962624; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-19 15:44:14,317 E 109492 109513] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_15-37-41_942580_108415 is over 95% full, available space: 519491584; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-19 15:44:24,331 E 109492 109513] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_15-37-41_942580_108415 is over 95% full, available s