# RecDP LLM - RAG

# Get started

## 1. Install pyrecdp and dependencies

In [None]:
! DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre
! pip install -q pyrecdp --pre
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. RAG Workflow

Setup and Run RAG Indexer Pipeline

In [10]:
from pyrecdp.primitives.operations import DocumentLoader,RAGTextFix,CustomerDocumentSplit,TextCustomerFilter,DocumentIngestion
from pyrecdp.LLM import TextPipeline
import os

urls = ['https://app.cnvrg.io/docs/', 
        'https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html',
        'https://app.cnvrg.io/docs/cli_v2/cnvrgv2_cli.html',
        'https://app.cnvrg.io/docs/collections/tutorials.html']

def custom_filter(text):
    from nltk.tokenize import word_tokenize
    ret_txt = None
    if len(word_tokenize(text)) >10:
        if text.split(' ')[0].lower()!='version':
            ret_txt = text
    return ret_txt != None

def chunk_doc(text,max_num_of_words):
    from nltk.tokenize import word_tokenize,sent_tokenize
    text= text.strip()
    if len(word_tokenize(text)) <= max_num_of_words:
        return [text]
    else:
        chunks = []
        # split by sentence
        sentences = sent_tokenize(text)
        # print('number of sentences: ', len(sentences))
        words_count = 0
        temp_chunk = ""
        for s in sentences:
            temp_chunk+=(s+" ")
            words_count += len(word_tokenize(s))
            if len(word_tokenize(temp_chunk))> max_num_of_words:
                chunks.append(temp_chunk)
                words_count = 0
                temp_chunk = ""
                
        return chunks

pipeline = TextPipeline()
ops = [
    DocumentLoader(loader='UnstructuredURLLoader', loader_args={'urls': urls}, requirements=['unstructured']),
    RAGTextFix(str_to_replace={'\n###': '', '\n##': '', '\n#': ''}, remove_extra_whitespace=True),
    CustomerDocumentSplit(func=lambda text: text.split('# ')[1:]),
    TextCustomerFilter(custom_filter),
    CustomerDocumentSplit(func=chunk_doc, max_num_of_words=50),
    DocumentIngestion(
        rag_framework='haystack',
        vector_store='elasticsearch',
        vector_store_args={'host': 'localhost', 'port': 9200}
    )
]
pipeline.add_operations(ops)
ds = pipeline.execute()
display(ds.to_pandas())


[32m2023-12-13 11:30:37.277[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m45[0m - [1mcheck_availability_and_install ['unstructured'][0m
[32m2023-12-13 11:30:37.281[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m45[0m - [1mcheck_availability_and_install ['ftfy', 'selectolax'][0m
[32m2023-12-13 11:30:37.284[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m45[0m - [1mcheck_availability_and_install ['langchain'][0m
[32m2023-12-13 11:30:37.341[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m45[0m - [1mcheck_availability_and_install ['unstructured'][0m
[32m2023-12-13 11:30:37.344[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m45[0m - [1mcheck_availability_and_install ['ftfy', 'selectolax']

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


2023-12-13 11:31:01,645	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>)]
2023-12-13 11:31:01,649	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-13 11:31:01,652	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/4 [00:00<?, ?it/s]

2023-12-13 11:31:02,271	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>)->Write]
2023-12-13 11:31:02,273	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-13 11:31:02,274	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/4 [00:00<?, ?it/s]

2023-12-13 11:31:02,685	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>)]
2023-12-13 11:31:02,686	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-13 11:31:02,688	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/4 [00:00<?, ?it/s]

execute with ray took 25.74781428091228 sec


Unnamed: 0,text,metadata
0,Welcome to cnvrg.io cnvrg.io is a machine lear...,{'source': 'https://app.cnvrg.io/docs/'}
1,Dataset Use Datasets to manage data with versi...,{'source': 'https://app.cnvrg.io/docs/'}
2,Use Papers to consolidate comparison across ex...,{'source': 'https://app.cnvrg.io/docs/'}
3,Easily update your running serving to keep it ...,{'source': 'https://app.cnvrg.io/docs/'}
4,Tutorials and Examples To help you get started...,{'source': 'https://app.cnvrg.io/docs/collecti...
...,...,...
92,"For example, gputype=v100 .To specify several,...",{'source': 'https://app.cnvrg.io/docs/core_con...
93,"For example, gputype=v100 .To specify several,...",{'source': 'https://app.cnvrg.io/docs/core_con...
94,"For example, gputype=v100 .To specify several,...",{'source': 'https://app.cnvrg.io/docs/core_con...
95,"options are: cnvrg, dockerhub, gcr, acr, ecr, ...",{'source': 'https://app.cnvrg.io/docs/core_con...
