# RecDP LLM - Document split

# Get started

## 1. Install pyrecdp and dependencies

In [None]:
! DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre
! pip install pyrecdp --pre
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. prepare your own data

In [None]:
%mkdir -p /content/test_data
%cd /content/test_data
%mkdir -p /content/doc_jsonl
file_names = ['english-and-korean.png', 'handbook-872p.docx', 'layout-parser-paper-10p.jpg', 'layout-parser-paper.pdf']
file_list = [f"https://raw.githubusercontent.com/intel/e2eAIOK/main/RecDP/tests/data/llm_data/document/{i}" for i in file_names]
!wget -P /content/test_data/document/ {" ".join(file_list)}

## 3. Sentence split with llmutils

In [8]:
import pandas as pd
from pyrecdp.primitives.llmutils import sentence_split
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

samples = [(
        'Smithfield employs 3,700 people at its plant in Sioux Falls, '
        'South Dakota. The plant slaughters 19,500 pigs a day — 5 '
        'percent of U.S. pork.',
        'Smithfield employs 3,700 people at its plant in Sioux Falls, '
        'South Dakota.\nThe plant slaughters 19,500 pigs a day — 5 '
        'percent of U.S. pork.')]
input_dataset = spark.createDataFrame(pd.DataFrame(samples, columns=["text", "target"]))
ret_df = sentence_split(input_dataset)
ret_df.show(truncate=False)
for _, row in ret_df.toPandas().iterrows():
        assert(row["text"] == row["target"])


+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                        |target                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+
|Smithfield employs 3,700 people at its plant in Sioux Falls, South Dakota.\nThe plant slaughters 19,500 pigs a day — 5 percent of U.S. pork.|Smithf

                                                                                

## 4. Sentence split with Operator

We prodive [DocumentSplit](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_split.py#L84) and [CustomerDocumentSplit](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_split.py#L278) to chunk documents

### 4.1 load document

In [6]:
from pyrecdp.primitives.operations import DocumentSplit,DocumentLoader

loader = DocumentLoader(loader="RecursiveUrlLoader", loader_args={"url": "https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html"})

ds = loader.process_rayds()
display(ds.to_pandas())


[2m[33m(raylet)[0m [2023-12-20 15:51:02,615 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.


Unnamed: 0,text,metadata
0,"<!DOCTYPE html>\n<html lang=""en-US"">\n <head>...",{'description': 'Documentation website for cnv...


[2m[33m(raylet)[0m [2023-12-20 15:51:12,627 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:51:22,638 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:51:32,651 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.


### 4.2 split document with DocumentSplit operator

In [7]:
spliter =  DocumentSplit(text_splitter='RecursiveCharacterTextSplitter')
ds = spliter.process_rayds(ds)
display(ds.to_pandas())

2023-12-20 15:52:00,384	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(<lambda>)]
2023-12-20 15:52:00,385	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-20 15:52:00,387	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,metadata
0,"<!DOCTYPE html>\n<html lang=""en-US"">\n <head>...",{'description': 'Documentation website for cnv...
1,"<link rel=""preload"" href=""/docs/assets/css/0.s...",{'description': 'Documentation website for cnv...
2,"rel=""prefetch"" href=""/docs/assets/js/18.698a17...",{'description': 'Documentation website for cnv...
3,"rel=""prefetch"" href=""/docs/assets/js/78.75a8bd...",{'description': 'Documentation website for cnv...
4,"<link rel=""stylesheet"" href=""/docs/assets/css/...",{'description': 'Documentation website for cnv...
...,...,...
101,"</code></pre></div><h3 id=""update-an-existing-...",{'description': 'Documentation website for cnv...
102,</code></pre></div><p><strong>Available Parame...,{'description': 'Documentation website for cnv...
103,</code></pre></div><p><strong>Available Parame...,{'description': 'Documentation website for cnv...
104,</code></pre></div><p><strong>Available Parame...,{'description': 'Documentation website for cnv...


[2m[33m(raylet)[0m [2023-12-20 15:52:02,690 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:52:12,702 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:52:22,715 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:52:32,729 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; cap

### 4.3 put DocumentSplit operator in a pipeline

In [8]:
from pyrecdp.LLM import TextPipeline
 
pipeline = TextPipeline()
ops = [
    loader,
    spliter
]
pipeline.add_operations(ops)
ds = pipeline.execute()
display(ds.to_pandas())

[2m[33m(raylet)[0m [2023-12-20 15:53:02,767 E 2215683 2215698] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-49-40_736779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.


init ray
init ray with total mem of 324413575987, total core of 48


2023-12-20 15:53:09,801	INFO worker.py:1642 -- Started a local Ray instance.


execute with ray started ...


2023-12-20 15:53:13,140	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(<lambda>)->Write]
2023-12-20 15:53:13,142	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-20 15:53:13,143	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(FlatMap(<lambda>)->Write pid=2224997)[0m 2023-12-20 15:53:14.774 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0
2023-12-20 15:53:15,081	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(<lambda>)]
2023-12-20 15:53:15,082	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-20 15:53:15,084	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

execute with ray took 4.245151562616229 sec


Unnamed: 0,text,metadata
0,"<!DOCTYPE html>\n<html lang=""en-US"">\n <head>...",{'description': 'Documentation website for cnv...
1,"<link rel=""preload"" href=""/docs/assets/css/0.s...",{'description': 'Documentation website for cnv...
2,"rel=""prefetch"" href=""/docs/assets/js/18.698a17...",{'description': 'Documentation website for cnv...
3,"rel=""prefetch"" href=""/docs/assets/js/78.75a8bd...",{'description': 'Documentation website for cnv...
4,"<link rel=""stylesheet"" href=""/docs/assets/css/...",{'description': 'Documentation website for cnv...
...,...,...
101,"</code></pre></div><h3 id=""update-an-existing-...",{'description': 'Documentation website for cnv...
102,</code></pre></div><p><strong>Available Parame...,{'description': 'Documentation website for cnv...
103,</code></pre></div><p><strong>Available Parame...,{'description': 'Documentation website for cnv...
104,</code></pre></div><p><strong>Available Parame...,{'description': 'Documentation website for cnv...


[2m[33m(raylet)[0m [2023-12-20 15:53:18,680 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:53:28,693 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:53:38,706 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:53:48,720 E 2224790 2224809] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-53-06_901719_2191522 is over 95% full, available space: 0; cap

### 4.4 Split document with CustomerDocumentSplit operator in a pipeline

In [9]:
from pyrecdp.LLM import TextPipeline
from pyrecdp.primitives.operations import CustomerDocumentSplit
 
def chunk_doc(text,max_num_of_words):
    from nltk.tokenize import word_tokenize,sent_tokenize
    text= text.strip()
    if len(word_tokenize(text)) <= max_num_of_words:
        return [text]
    else:
        chunks = []
        # split by sentence
        sentences = sent_tokenize(text)
        # print('number of sentences: ', len(sentences))
        words_count = 0
        temp_chunk = ""
        for s in sentences:
            temp_chunk+=(s+" ")
            words_count += len(word_tokenize(s))
            if len(word_tokenize(temp_chunk))> max_num_of_words:
                chunks.append(temp_chunk)
                words_count = 0
                temp_chunk = ""
                
        return chunks
     
pipeline = TextPipeline()
ops = [
    loader,
     CustomerDocumentSplit(func=chunk_doc, max_num_of_words=50),
]
pipeline.add_operations(ops)
ds = pipeline.execute()
display(ds.to_pandas())

init ray
init ray with total mem of 324413575987, total core of 48


2023-12-20 15:56:00,827	INFO worker.py:1642 -- Started a local Ray instance.


execute with ray started ...


2023-12-20 15:56:03,800	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(<lambda>)->Write]
2023-12-20 15:56:03,802	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-20 15:56:03,803	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

[2m[36m(FlatMap(<lambda>)->Write pid=2233388)[0m 2023-12-20 15:56:05.399 | INFO     | pyrecdp.core.import_utils:check_availability_and_install:52 - check_availability_and_install emoji==2.2.0
2023-12-20 15:56:06,464	INFO streaming_executor.py:93 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(<lambda>)]
2023-12-20 15:56:06,465	INFO streaming_executor.py:94 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2023-12-20 15:56:06,466	INFO streaming_executor.py:96 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`


Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

execute with ray took 5.179382940754294 sec


Unnamed: 0,text,metadata
0,"<!DOCTYPE html>\n<html lang=""en-US"">\n <head>...",{'description': 'Documentation website for cnv...
1,"Through the SDK, you can create experiments, m...",{'description': 'Documentation website for cnv...
2,"First, install WSL with following commands:</p...",{'description': 'Documentation website for cnv...
3,https://app.cnvrgdomain.com/</td> <td>Yes</td>...,{'description': 'Documentation website for cnv...
4,the token can be retrieved from the <strong>us...,{'description': 'Documentation website for cnv...
...,...,...
61,"For example, <code>value1=NoSchedule</code>.To...",{'description': 'Documentation website for cnv...
62,"For example, <code>gputype=v100</code>.To spec...",{'description': 'Documentation website for cnv...
63,"For example, <code>value1=NoSchedule</code>.To...",{'description': 'Documentation website for cnv...
64,"options are: cnvrg, dockerhub, gcr, acr, ecr, ...",{'description': 'Documentation website for cnv...


[2m[33m(raylet)[0m [2023-12-20 15:56:09,706 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:56:19,719 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:56:29,731 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; capacity: 422146228224. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-12-20 15:56:39,741 E 2233193 2233212] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-20_15-55-57_818779_2191522 is over 95% full, available space: 0; cap