# RecDP LLM - RAG

# Get started

## 1. Install pyrecdp and dependencies

In [None]:
! DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre
! pip install -q pyrecdp --pre
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. RAG Workflow

Setup and Run RAG Indexer Pipeline

In [2]:
from pyrecdp.primitives.operations import RecursiveUrlLoader,RAGTextFix,CustomerDocumentSplit,TextCustomerFilter,DocumentIngestion
from pyrecdp.LLM import TextPipeline
import os

urls = ['https://app.cnvrg.io/docs/', 
        'https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html',
        'https://app.cnvrg.io/docs/cli_v2/cnvrgv2_cli.html',
        'https://app.cnvrg.io/docs/collections/tutorials.html']

def custom_filter(text):
    from nltk.tokenize import word_tokenize
    ret_txt = None
    if len(word_tokenize(text)) >10:
        if text.split(' ')[0].lower()!='version':
            ret_txt = text
    return ret_txt != None

def chunk_doc(text,max_num_of_words):
    from nltk.tokenize import word_tokenize,sent_tokenize
    text= text.strip()
    if len(word_tokenize(text)) <= max_num_of_words:
        return [text]
    else:
        chunks = []
        # split by sentence
        sentences = sent_tokenize(text)
        # print('number of sentences: ', len(sentences))
        words_count = 0
        temp_chunk = ""
        for s in sentences:
            temp_chunk+=(s+" ")
            words_count += len(word_tokenize(s))
            if len(word_tokenize(temp_chunk))> max_num_of_words:
                chunks.append(temp_chunk)
                words_count = 0
                temp_chunk = ""
                
        return chunks

pipeline = TextPipeline()
ops = [
    RecursiveUrlLoader(urls, max_depth=2),
    RAGTextFix(str_to_replace={'\n###': '', '\n##': '', '\n#': ''}, remove_extra_whitespace=True),
    CustomerDocumentSplit(func=lambda text: text.split('# ')[1:]),
    TextCustomerFilter(custom_filter),
    CustomerDocumentSplit(func=chunk_doc, max_num_of_words=50),
    DocumentIngestion(
        rag_framework='haystack',
        vector_store='elasticsearch',
        vector_store_args={'host': 'localhost', 'port': 9200}
    )
]
pipeline.add_operations(ops)
ds = pipeline.execute()
display(ds.to_pandas())


[32m2023-12-19 16:55:34.829[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['bs4', 'langchain'][0m
[32m2023-12-19 16:55:34.833[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['ftfy', 'selectolax'][0m
[32m2023-12-19 16:55:34.836[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['langchain'][0m


[32m2023-12-19 16:55:34.894[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['bs4', 'langchain'][0m
[32m2023-12-19 16:55:34.897[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['ftfy', 'selectolax'][0m
[32m2023-12-19 16:55:34.900[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['langchain'][0m
init ray
execute with ray started ...


[2m[33m(raylet)[0m [2023-12-19 16:55:38,001 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available space: 456495104; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-19 16:55:48,016 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available space: 456441856; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-19 16:55:58,028 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available space: 456417280; capacity: 422146228224. Object creation will fail if spilling is required.


[32m2023-12-19 16:56:00.001[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m47[0m - [1mcheck_availability_and_install ['farm-haystack', 'farm-haystack[elasticsearch7]'][0m


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


[2m[33m(raylet)[0m [2023-12-19 16:56:08,042 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available space: 456359936; capacity: 422146228224. Object creation will fail if spilling is required.
Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


[2m[33m(raylet)[0m [2023-12-19 16:56:18,054 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available space: 455839744; capacity: 422146228224. Object creation will fail if spilling is required.
2023-12-19 16:56:21,436	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>)->MapBatches(<lambda>)]
2023-12-19 16:56:21,438	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-19 16:56:21,439	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-19 16:56:26,361	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>)]
2023-12-19 16:56:26,363	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-19 16:56:26,366	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/20 [00:00<?, ?it/s]

execute with ray took 51.880861999467015 sec


Unnamed: 0,text,metadata
0,Welcome to cnvrg.io cnvrg.io is a machine lear...,{'description': 'Documentation website for cnv...
1,Dataset Use Datasets to manage data with versi...,{'description': 'Documentation website for cnv...
2,Use Papers to consolidate comparison across ex...,{'description': 'Documentation website for cnv...
3,"Resource Management With our Dashboard, get a ...",{'description': 'Documentation website for cnv...
4,AI Library Continual learning and building mac...,{'description': 'Documentation website for cnv...
...,...,...
900,Delete a registry To delete a registry from th...,{'description': 'Documentation website for cnv...
901,"Create an image To create an image, use the fo...",{'description': 'Documentation website for cnv...
902,Get an image To retrieve information about an ...,{'description': 'Documentation website for cnv...
903,when reffering to an image built from dockerfi...,{'description': 'Documentation website for cnv...


[2m[33m(raylet)[0m [2023-12-19 16:56:28,069 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available space: 454561792; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-19 16:56:38,082 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available space: 454459392; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-19 16:56:48,096 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available space: 454348800; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-19 16:56:58,109 E 241023 241042] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-19_16-52-05_854938_240267 is over 95% full, available s