# RecDP LLM - RAG Text fixer

Clean and refine documents for the RAG (retrieval-augmented generation) process.






# Get started

## 1. Install pyrecdp and dependencies

In [None]:
! DEBIAN_FRONTEND=noninteractive apt-get install -y openjdk-8-jre
! pip install pyrecdp --pre
# ! pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP'

## 2. Usage

### 2.1 Use UrlLoader to load a list of URLs and show the results without using RAGTextFix.





In [None]:
from pyrecdp.LLM import TextPipeline
from pyrecdp.primitives.operations import *

# Pages handed to UrlLoader.
urls = ['https://app.cnvrg.io/docs/',
        'https://app.cnvrg.io/docs/core_concepts/python_sdk_v2.html',
        'https://app.cnvrg.io/docs/cli_v2/cnvrgv2_cli.html',
        'https://app.cnvrg.io/docs/collections/tutorials.html']

# NOTE(review): `Soup` is not imported explicitly in this notebook; presumably it is
# provided by the star import above (a BeautifulSoup alias?) -- confirm.
# The extractor keeps only the text of each HTML page.
# Fixed: the keyword argument previously referenced the undefined name `url`.
loader = UrlLoader(urls=urls, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text)

# Print the loaded text as-is, before any RAGTextFix clean-up is applied.
ds = loader.process_rayds()
for row in ds.iter_rows():
    print(row['text'])


### 2.2 Use UrlLoader to load a list of urls recursively and use RAGTextFix to clean up documents.

* Step 1: Fix unicode errors in text using ftfy

* Step 2: Normalize the different kinds of whitespace characters in the text to a plain space ' ' (0x20). The different whitespace characters are listed here: https://en.wikipedia.org/wiki/Whitespace_character

* Step 3: Clean specific chars in text.

* Step 4: Replace the specified string.

* Step 5: Remove extra whitespaces.

* Step 6: Re-segment sentences in the text to avoid sentence segmentation errors caused by unnecessary line breaks.

In [None]:
from pyrecdp.LLM import TextPipeline
from pyrecdp.primitives.operations import *

# Location passed to ParquetWriter; the next section reads the result back from here.
output_path = "TextPipeline_output"
pipeline = TextPipeline()

# Markdown heading markers that follow a newline are replaced with empty strings.
heading_replacements = {'\n###': '', '\n##': '', '\n#': ''}

# Operations handed to the pipeline: the loader from section 2.1, then RAGTextFix,
# then ParquetWriter.
ops = [loader]
ops.append(
    RAGTextFix(
        str_to_replace=heading_replacements,
        remove_extra_whitespace=True,
    )
)
ops.append(ParquetWriter(output_path))

pipeline.add_operations(ops)
ret = pipeline.execute()

# Drop the notebook's reference to the pipeline once execution has finished.
del pipeline

### 2.3 Show results after RAGTextFix

In [None]:
import pandas as pd
# Read back the parquet data found at `output_path` and preview the first five rows.
result_pd = pd.read_parquet(output_path)
result_pd.head(n=5)