# RecDP LLM - RAG

# Get started

## 1. Install pyrecdp and dependencies

In [None]:
# Install the OpenJDK 8 JRE; DEBIAN_FRONTEND=noninteractive and -y keep apt from prompting.
! DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre
# Install pyrecdp from a local checkout of the e2eAIOK repository.
! pip install /work/e2eAIOK/RecDP
# Alternative (disabled): install pyrecdp directly from the GitHub repository instead.
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. RAG Workflow

### Step 1. Set up the RAG Indexer Pipeline

In [None]:
from pyrecdp.primitives.operations import *
from pyrecdp.LLM import TextPipeline
import os

# Documentation pages passed to the DocumentLoader stage as its 'urls' loader argument.
urls = ['https://app.cnvrg.io/docs/', 
        'https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html',
        'https://app.cnvrg.io/docs/cli_v2/cnvrgv2_cli.html',
        'https://app.cnvrg.io/docs/collections/tutorials.html']

def custom_filter(text):
    """Decide whether a text section is worth keeping.

    A section is kept only when it tokenizes to more than 10 words and its
    first space-delimited word is not "version" (compared case-insensitively).

    Args:
        text: The section text to inspect.

    Returns:
        True when the section should be kept, False otherwise.
    """
    # Imported inside the function, as in the rest of this notebook's helpers.
    from nltk.tokenize import word_tokenize

    # Too short to carry useful content.
    if len(word_tokenize(text)) <= 10:
        return False

    leading_word = text.split(' ')[0]
    return leading_word.lower() != 'version'

def chunk_doc(text, max_num_of_words=50):
    """Split ``text`` into sentence-aligned chunks of about ``max_num_of_words`` tokens.

    Text that tokenizes to at most ``max_num_of_words`` words is returned as a
    single stripped chunk. Longer text is split into sentences, and sentences
    are accumulated until the running chunk exceeds the limit, at which point
    the chunk is emitted. A chunk may therefore be longer than the limit by up
    to one sentence, because sentences are never split.

    Args:
        text: The document text to split.
        max_num_of_words: Token count above which the running chunk is closed.
            Defaults to 50.

    Returns:
        A list of chunk strings. Chunks built from several sentences keep a
        trailing space after each sentence.
    """
    from nltk.tokenize import word_tokenize, sent_tokenize

    text = text.strip()
    if len(word_tokenize(text)) <= max_num_of_words:
        return [text]

    chunks = []
    temp_chunk = ""
    for s in sent_tokenize(text):
        temp_chunk += s + " "
        # The accumulated chunk is re-tokenized so the boundary decision is
        # based on the joined text rather than a sum of per-sentence counts.
        if len(word_tokenize(temp_chunk)) > max_num_of_words:
            chunks.append(temp_chunk)
            temp_chunk = ""

    # Emit the sentences collected after the last full chunk; without this the
    # tail of the document would be dropped.
    if temp_chunk:
        chunks.append(temp_chunk)

    return chunks

pipeline = TextPipeline()
parquet_file = "/content/test/parquet"

# Assemble the indexing stages one by one, in execution order.
ops = []

# Load the pages listed in `urls` using the 'UnstructuredURLLoader' loader.
ops.append(
    DocumentLoader(loader='UnstructuredURLLoader', loader_args={'urls': urls})
)

# Replace the newline-prefixed '#', '##' and '###' markers with empty strings
# and enable extra-whitespace removal.
ops.append(
    RAGTextFix(
        str_to_replace={'\n###': '', '\n##': '', '\n#': ''},
        remove_extra_whitespace=True,
    )
)

# Split each document on '# ' and discard the piece before the first marker.
ops.append(CustomerDocumentSplit(func=lambda text: text.split('# ')[1:]))

# Keep only the sections accepted by custom_filter.
ops.append(TextCustomerFilter(custom_filter))

# Break the remaining sections into sentence-aligned chunks.
ops.append(CustomerDocumentSplit(func=chunk_doc))

# Ingest the chunks through haystack into the elasticsearch store at localhost:9200.
ops.append(
    DocumentIngestion(
        rag_framework='haystack',
        vector_store='elasticsearch',
        vector_store_args={'host': 'localhost', 'port': 9200},
        embeddings=None,
    )
)

pipeline.add_operations(ops)


### Step 2. Run RAG Indexer Pipeline

In [2]:
# Run every operation registered on the pipeline and keep the resulting dataset in `ds`.
ds = pipeline.execute()
# Convert the result to a pandas DataFrame and render it with the notebook's rich display.
display(ds.to_pandas())

[32m2023-12-12 21:52:00.129[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m45[0m - [1mcheck_availability_and_install ['ftfy', 'selectolax'][0m
[32m2023-12-12 21:52:00.131[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m45[0m - [1mcheck_availability_and_install ['langchain'][0m


init ray
init ray with total mem of 324413575987, total core of 48


2023-12-12 21:52:03,943	INFO worker.py:1642 -- Started a local Ray instance.


execute with ray started ...
[32m2023-12-12 21:52:25.937[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m50[0m - [1mcheck_availability_and_install nltk[0m
[32m2023-12-12 21:52:25.940[0m | [1mINFO    [0m | [36mpyrecdp.core.model_utils[0m:[36mprepare_nltk_model[0m:[36m164[0m - [1mLoading nltk punkt split model...[0m
[32m2023-12-12 21:52:25.954[0m | [1mINFO    [0m | [36mpyrecdp.core.import_utils[0m:[36mcheck_availability_and_install[0m:[36m45[0m - [1mcheck_availability_and_install ['farm-haystack', 'farm-haystack[elasticsearch7]'][0m


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


2023-12-12 21:52:41,235	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>)]
2023-12-12 21:52:41,237	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-12 21:52:41,239	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/4 [00:00<?, ?it/s]

[2m[36m(Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>) pid=752838)[0m 2023-12-12 21:52:43.205 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:50 - check_availability_and_install emoji==2.2.0


execute with ray took 40.127191949635744 sec


RayTaskError(TypeError): [36mray::Map(<lambda>)->FlatMap(<lambda>)->Filter(<lambda>)->FlatMap(<lambda>)()[39m (pid=752835, ip=10.0.0.137)
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 405, in _map_task
    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 345, in __call__
    for data in iter:
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 171, in __call__
    yield from self._row_fn(input, ctx)
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 244, in transform_fn
    for row in rows:
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 223, in __call__
    for block in blocks:
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 345, in __call__
    for data in iter:
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 171, in __call__
    yield from self._row_fn(input, ctx)
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 256, in transform_fn
    for row in rows:
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 223, in __call__
    for block in blocks:
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 345, in __call__
    for data in iter:
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 171, in __call__
    yield from self._row_fn(input, ctx)
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 245, in transform_fn
    for out_row in fn(row):
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 119, in fn
    return op_fn(item, *fn_args, **fn_kwargs)
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/pyrecdp/primitives/operations/text_split.py", line 69, in <lambda>
    return ds.flat_map(lambda sample: split_text(sample, self.text_split_func))
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/pyrecdp/primitives/operations/text_split.py", line 63, in split_text
    for text in text_split_func(sample[self.text_key]):
  File "/root/miniforge3/envs/recdp/lib/python3.10/site-packages/pyrecdp/primitives/operations/text_split.py", line 296, in process
    return self.split_func(text, **self.func_kwargs)
TypeError: 'str' object is not callable