# From llama-index to DPK

### This notebook shows how llama-index nodes can be converted to a Parquet file and then processed by DPK (Data Prep Kit).

In [None]:
# Install the packages listed in requirements.txt into the kernel's environment
# (%pip targets the running kernel; -qq keeps pip's output to a minimum).
%pip install -qq -r requirements.txt

In [None]:
# Install the packages listed in dpk-requirements.txt
# (presumably the DPK transform packages imported further down -- confirm against that file).
%pip install -qq -r dpk-requirements.txt

In [None]:
%pip install -qq llama-index%pip install -qq llama-index-readers-wikipedia%pip install -qq wikipedia

In [1]:
import shutil
import os

# All output locations are resolved relative to the directory the notebook runs in.
cwd = os.getcwd()

output_base_path = f"{cwd}/output"

# The trailing slash is intentional: later cells build sub-paths by plain string
# concatenation (e.g. output_folder+'docq', output_folder+'out.parquet').
output_folder =  f"{output_base_path}/llama_index/"

# Start every run from an empty output tree.
shutil.rmtree(output_base_path, ignore_errors=True)
print (f"✅ Cleared {output_folder} directory")

# makedirs creates the base directory and the llama_index sub-directory in one call.
# exist_ok=True avoids a FileExistsError when rmtree (which ignores errors) could
# not fully remove a previous tree.
os.makedirs(output_folder, exist_ok=True)

✅ Cleared /Users/alexey/goWork/src/github.com/data-preprocessing/data-prep-lab/examples/agentic-workflow/output/llama_index/ directory


## Ingest Wikipedia page

In [2]:
from llama_index.core import Document, SummaryIndex
from llama_index.readers.wikipedia import WikipediaReader
from llama_index.core.node_parser import SimpleNodeParser
from pathlib import Path
import pandas as pd

# Fetch the Wikipedia page as llama-index documents.
documents = WikipediaReader().load_data(pages=["swear words"])

# Split the documents into nodes using the parser's default settings.
nodes = SimpleNodeParser.from_defaults().get_nodes_from_documents(documents)


## Convert llama-index nodes to rows in parquet table

In [3]:
# Convert llama-index nodes to table rows: one row per node, with the node text
# in the "contents" column and one column per node metadata key.
#
# Rows are built as per-node records so that nodes with differing metadata keys
# still line up: a key missing on a node becomes a missing value in that row
# instead of raising KeyError or producing columns of unequal length.
records = []
for node in nodes:
    record = {"contents": node.text}
    for key, value in node.metadata.items():
        # setdefault keeps the node text if a metadata key is itself named "contents".
        record.setdefault(key, value)
    records.append(record)

# With no nodes, still produce the (empty) "contents" column.
df = pd.DataFrame(records) if records else pd.DataFrame({"contents": []})

## Save the parquet file

In [4]:
# Write the node table as a single parquet file inside the output folder.
parquet_path = output_folder+'out.parquet'
df.to_parquet(parquet_path)

## Run DPK transforms to remove chunks with bad words.

### Apply DPK Doc quality

In [5]:
from dpk_doc_quality.transform_python import DocQuality

# Read the parquet file(s) from output_folder, score the "contents" column as
# English text, and write the annotated table under the 'docq' sub-folder.
docq_params = {
    "input_folder": output_folder,
    "output_folder": output_folder+'docq',
    "docq_text_lang": "en",
    "docq_doc_content_column": "contents",
}
DocQuality(**docq_params).transform()



13:28:37 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': '/Users/alexey/goWork/src/github.com/data-preprocessing/data-prep-lab/examples/agentic-workflow/venv/lib/python3.12/site-packages/dpk_doc_quality/ldnoobw/en', 's3_cred': None, 'docq_data_factory': <data_processing.data_access.data_access_factory.DataAccessFactory object at 0x16aa861b0>}
13:28:37 INFO - data factory docq_ is using local configuration without input/output path
13:28:37 INFO - data factory docq_ max_files -1, n_sample -1
13:28:37 INFO - data factory docq_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']
13:28:37 INFO - pipeline id pipeline_id
13:28:37 INFO - code location None
13:28:37 INFO - data factory data_ is using local data access: input_folder - /Users/alexey/goWork/src/github.com/data-preprocessing/data-prep-lab/examples/agentic-workflow/output/llama_index/ out

0

### Print the output

In [6]:
import importlib
import os
import urllib.request

# my_utils.py is a helper module hosted in the data-prep-kit repository. Fetch it
# if it is not already next to the notebook, so this cell also works on a fresh
# checkout / fresh kernel instead of depending on a file left by a previous run.
MY_UTILS_URL = 'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'
if not os.path.exists('my_utils.py'):
    with urllib.request.urlopen(MY_UTILS_URL) as response:
        my_utils_source = response.read()
    with open('my_utils.py', 'wb') as my_utils_file:
        my_utils_file.write(my_utils_source)
    # Let the import system notice the module file that was just created.
    importlib.invalidate_caches()

from my_utils import read_parquet_files_as_df

# Load the Doc Quality output and show the text next to its bad-word flag.
output_df = read_parquet_files_as_df(output_folder+'docq')

output_df[['contents','docq_contain_bad_word']]

Unnamed: 0,contents,docq_contain_bad_word
0,"Profanity, also known as swearing, cursing, or...",True
1,== Subjects ==\nProfanities have literal meani...,True
2,=== Anatomy and sexuality ===\n\nProfanity rel...,True
3,=== Other subjects ===\nIllness has historical...,True
4,"In some circumstances, swearing can be used as...",True
5,Though profanity is usually associated with ta...,True
6,== Censorship and avoidance ==\n\nThe idea of ...,True
7,== Legality ==\n\n\n=== Australia ===\nIn ever...,True
8,=== Pakistan ===\nPolitical leaders in Pakista...,False
9,== Religious views ==\n\n\n=== Judaism ===\nRa...,False


### Apply DPK Filtering

In [7]:
# Drop every row that Doc Quality flagged as containing a bad word.
from dpk_filter.transform_python import Filter

filter_params = {
    "input_folder": output_folder+'docq',
    "output_folder": output_folder+'filter',
    # Keep only the rows where the flag is false.
    "filter_criteria_list": ["NOT docq_contain_bad_word"],
}
Filter(**filter_params).transform()

13:28:41 INFO - pipeline id pipeline_id
13:28:41 INFO - code location None
13:28:41 INFO - data factory data_ is using local data access: input_folder - /Users/alexey/goWork/src/github.com/data-preprocessing/data-prep-lab/examples/agentic-workflow/output/llama_index/docq output_folder - /Users/alexey/goWork/src/github.com/data-preprocessing/data-prep-lab/examples/agentic-workflow/output/llama_index/filter
13:28:41 INFO - data factory data_ max_files -1, n_sample -1
13:28:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']
13:28:41 INFO - orchestrator filter started at 2025-01-16 13:28:41
13:28:41 INFO - Number of files is 1, source profile {'max_file_size': 0.023977279663085938, 'min_file_size': 0.023977279663085938, 'total_file_size': 0.023977279663085938}
13:28:41 INFO - Completed 1 files (100.0%) in 0.0 min
13:28:41 INFO - Done processing 1 files, waiting for flush() compl

0

### Inspect Generated Output File

In [8]:
# Set TOKENIZERS_PARALLELISM to "false" for this kernel process
# (presumably to silence tokenizer parallelism warnings in the encoding step -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Download the my_utils.py helper module from the data-prep-kit repository into the
# working directory, overwriting any existing copy (-O).
# NOTE(review): an earlier cell already imports my_utils, so on a clean checkout this
# download happens too late -- consider moving it before the first import.
!wget -O 'my_utils.py'  'https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py'

--2025-01-16 13:28:43--  https://raw.githubusercontent.com/IBM/data-prep-kit/dev/examples/notebooks/intro/my_utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8003::154, 2606:50c0:8001::154, ...
connected. to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... 
...הבושת תלבקל ןיתממ ,החלשנ HTTP תיינפ 200 OK
Length: 1856 (1.8K) [text/plain]
Saving to: ‘my_utils.py’


2025-01-16 13:28:44 (4.05 MB/s) - ‘my_utils.py’ saved [1856/1856]



In [9]:
from my_utils import read_parquet_files_as_df

# Load what the Filter transform wrote and report the table's size.
filter_dir = output_folder+'filter'
output_df = read_parquet_files_as_df(filter_dir)

n_rows, n_cols = output_df.shape
print ("Output data dimensions (rows x columns)= ", (n_rows, n_cols))

# Last expression: rendered as a table by the notebook.
output_df

Output data dimensions (rows x columns)=  (4, 12)


Unnamed: 0,contents,docq_total_words,docq_mean_word_len,docq_symbol_to_word_ratio,docq_sentence_count,docq_lorem_ipsum_ratio,docq_curly_bracket_ratio,docq_contain_bad_word,docq_bullet_point_ratio,docq_ellipsis_line_ratio,docq_alphabet_word_ratio,docq_contain_common_en_words
0,=== Pakistan ===\nPolitical leaders in Pakista...,793,5.104666,0.0,30,0.0,0.0,False,0.0,0.0,0.959647,True
1,== Religious views ==\n\n\n=== Judaism ===\nRa...,460,5.093478,0.0,23,0.0,0.0,False,0.0,0.0,0.952174,True
2,=== Islam ===\nAccording to Ayatullah Ibrahim ...,264,7.416667,0.0,78,0.0,0.0,False,0.0,0.0,0.825758,True
3,"== Further reading ==\nAlmond, Ian (2003). ""De...",265,6.033962,0.0,51,0.0,0.0,False,0.0,0.0,0.85283,True


### Apply DPK text encoding

In [10]:
# Install or upgrade ipywidgets in the kernel's environment
# (presumably so progress bars in the encoding step render as widgets -- confirm).
# A kernel restart may be needed before the upgraded package is picked up.
%pip install -qq -U ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [11]:
from dpk_text_encoder.transform_python import TextEncoder
from data_processing.utils import GB

# Compute embeddings for the filtered rows with the BAAI/bge-small-en-v1.5 model
# and write the result under the 'encoding' sub-folder.
encoder_params = {
    "input_folder": output_folder+'filter',
    "output_folder": output_folder+'encoding',
    "text_encoder_model_name": 'BAAI/bge-small-en-v1.5',
}
# Assigning the return value keeps it from being echoed as the cell output.
x = TextEncoder(**encoder_params).transform()

13:28:51 INFO - text_encoder parameters are : {'content_column_name': 'contents', 'output_embeddings_column_name': 'embeddings', 'model_name': 'BAAI/bge-small-en-v1.5'}
13:28:51 INFO - pipeline id pipeline_id
13:28:51 INFO - code location None
13:28:51 INFO - data factory data_ is using local data access: input_folder - /Users/alexey/goWork/src/github.com/data-preprocessing/data-prep-lab/examples/agentic-workflow/output/llama_index/filter output_folder - /Users/alexey/goWork/src/github.com/data-preprocessing/data-prep-lab/examples/agentic-workflow/output/llama_index/encoding
13:28:51 INFO - data factory data_ max_files -1, n_sample -1
13:28:51 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']
13:28:51 INFO - orchestrator text_encoder started at 2025-01-16 13:28:51
13:28:51 INFO - Number of files is 1, source profile {'max_file_size': 0.01585102081298828, 'min_file_size': 0.015

In [12]:
from my_utils import read_parquet_files_as_df

# Load what the text encoder wrote and report the table's size.
encoding_dir = output_folder+'encoding'
output_df = read_parquet_files_as_df(encoding_dir)

n_rows, n_cols = output_df.shape
print ("Output data dimensions (rows x columns)= ", (n_rows, n_cols))

# Last expression: rendered as a table by the notebook.
output_df

Output data dimensions (rows x columns)=  (4, 13)


Unnamed: 0,contents,docq_total_words,docq_mean_word_len,docq_symbol_to_word_ratio,docq_sentence_count,docq_lorem_ipsum_ratio,docq_curly_bracket_ratio,docq_contain_bad_word,docq_bullet_point_ratio,docq_ellipsis_line_ratio,docq_alphabet_word_ratio,docq_contain_common_en_words,embeddings
0,=== Pakistan ===\nPolitical leaders in Pakista...,793,5.104666,0.0,30,0.0,0.0,False,0.0,0.0,0.959647,True,"[-0.038121343, -0.03633737, 0.03001021, -0.034..."
1,== Religious views ==\n\n\n=== Judaism ===\nRa...,460,5.093478,0.0,23,0.0,0.0,False,0.0,0.0,0.952174,True,"[0.0014923534, 0.04133316, 0.010005907, -0.015..."
2,=== Islam ===\nAccording to Ayatullah Ibrahim ...,264,7.416667,0.0,78,0.0,0.0,False,0.0,0.0,0.825758,True,"[-0.048527546, 0.008751696, -0.028325684, 0.01..."
3,"== Further reading ==\nAlmond, Ian (2003). ""De...",265,6.033962,0.0,51,0.0,0.0,False,0.0,0.0,0.85283,True,"[-0.038133983, -0.007959435, 0.016050057, 0.01..."


## Load Processed Data into Vector Database
### ref: https://github.com/IBM/data-prep-kit/blob/dev/examples/notebooks/rag/rag_1B_load_data_into_milvus.ipynb

### Load Parquet Data

In [13]:
import pandas as pd
import glob

# Read the encoded parquet output back into a single DataFrame.
encoded_dir = output_folder+'encoding'
print ('Loading data from : ', encoded_dir)

data_df = read_parquet_files_as_df(encoded_dir)

print (f"\nTotal number of rows = {data_df.shape[0]}")

Loading data from :  /Users/alexey/goWork/src/github.com/data-preprocessing/data-prep-lab/examples/agentic-workflow/output/llama_index/encoding

Total number of rows = 4


In [14]:
## Shape the data

# The encoder output stores vectors in 'embeddings'. Once this cell has run, the
# column is called 'vector', so look up whichever name is present. This keeps the
# cell re-runnable instead of raising KeyError on the second execution.
vector_column = 'embeddings' if 'embeddings' in data_df.columns else 'vector'
EMBEDDING_LENGTH =  len(data_df.iloc[0][vector_column])
print ('embedding length: ', EMBEDDING_LENGTH)

# Rename 'embeddings' -> 'vector' and 'contents' -> 'text' to match the default schema.
# rename() skips labels that are not present, so this is a no-op on a re-run.
data_df = data_df.rename( columns= {'embeddings' : 'vector', 'contents' : 'text'})

# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
data_df.info()
data_df.head(3)

embedding length:  384
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   text                          4 non-null      object 
 1   docq_total_words              4 non-null      int64  
 2   docq_mean_word_len            4 non-null      float64
 3   docq_symbol_to_word_ratio     4 non-null      float64
 4   docq_sentence_count           4 non-null      int64  
 5   docq_lorem_ipsum_ratio        4 non-null      float64
 6   docq_curly_bracket_ratio      4 non-null      float64
 7   docq_contain_bad_word         4 non-null      bool   
 8   docq_bullet_point_ratio       4 non-null      float64
 9   docq_ellipsis_line_ratio      4 non-null      float64
 10  docq_alphabet_word_ratio      4 non-null      float64
 11  docq_contain_common_en_words  4 non-null      bool   
 12  vector                        4 non-null     

Unnamed: 0,text,docq_total_words,docq_mean_word_len,docq_symbol_to_word_ratio,docq_sentence_count,docq_lorem_ipsum_ratio,docq_curly_bracket_ratio,docq_contain_bad_word,docq_bullet_point_ratio,docq_ellipsis_line_ratio,docq_alphabet_word_ratio,docq_contain_common_en_words,vector
0,=== Pakistan ===\nPolitical leaders in Pakista...,793,5.104666,0.0,30,0.0,0.0,False,0.0,0.0,0.959647,True,"[-0.038121343, -0.03633737, 0.03001021, -0.034..."
1,== Religious views ==\n\n\n=== Judaism ===\nRa...,460,5.093478,0.0,23,0.0,0.0,False,0.0,0.0,0.952174,True,"[0.0014923534, 0.04133316, 0.010005907, -0.015..."
2,=== Islam ===\nAccording to Ayatullah Ibrahim ...,264,7.416667,0.0,78,0.0,0.0,False,0.0,0.0,0.825758,True,"[-0.048527546, 0.008751696, -0.028325684, 0.01..."


### Connect to Vector Database

In [15]:
%pip install -qq -U pymilvus
%pip install -qq pymilvus[model]

Note: you may need to restart the kernel to use updated packages.
zsh:1: no matches found: pymilvus[model]
Note: you may need to restart the kernel to use updated packages.


In [16]:
from pymilvus import MilvusClient

# Open the Milvus database stored at a local file path.
MILVUS_DB_PATH = "./milvus_demo.db"
milvus_client = MilvusClient(MILVUS_DB_PATH)

print ("✅ Connected to Milvus instance:", MILVUS_DB_PATH)

✅ Connected to Milvus instance: ./milvus_demo.db


### Create A Collection

In [17]:
COLLECTION_NAME="test"

# Start from a clean slate: drop the collection if a previous run left one behind.
collection_exists = milvus_client.has_collection(collection_name=COLLECTION_NAME)
if collection_exists:
    milvus_client.drop_collection(collection_name=COLLECTION_NAME)
    print ('✅ Cleared collection :', COLLECTION_NAME)

# Settings for the new collection; the vector dimension comes from the
# embedding length measured on the loaded data.
collection_settings = {
    "collection_name": COLLECTION_NAME,
    "dimension": EMBEDDING_LENGTH,
    "metric_type": "IP",  # Inner product distance
    "consistency_level": "Strong",  # Strong consistency level
    "auto_id": True,
}
milvus_client.create_collection(**collection_settings)
print ("✅ Created collection :", COLLECTION_NAME)

✅ Cleared collection : test
✅ Created collection : test


In [18]:
# Turn each DataFrame row into a dict and insert them all in one call.
rows_to_insert = data_df.to_dict('records')
res = milvus_client.insert(collection_name=COLLECTION_NAME, data=rows_to_insert)

print('inserted # rows', res['insert_count'])

# Last expression: show the collection statistics after the insert.
milvus_client.get_collection_stats(COLLECTION_NAME)

inserted # rows 4


{'row_count': 4}

### Close DB Connection

In [19]:
# Release the Milvus client now that all rows are stored.
milvus_client.close()

print("✅ SUCCESS")

✅ SUCCESS
