# Read Documents, Split and Load into VectorDB 

- Read documents with pyMupdfreader
- Split documents with 
- Create Embedding Function
- VectorDB is ChromaDB

In [1]:
import chromadb

Import project config and functions

In [2]:
from dotenv import load_dotenv

import os # Importing os module for operating system functionalities
#import shutil # Importing shutil module for high-level file operations

import sys
import importlib
load_dotenv()

# ************************************************
# *** Add PACKAGE_PATH to your .env file with full
# ***   path to your local_rag_llm/run/pkgs folder
# ************************************************
package_path = os.getenv('PACKAGE_PATH')
# ************************************************

#package_path = "/mnt/c/ML/DU/local_rag_llm/run/pkgs"
sys.path.insert(1, package_path)
import config
import rag_functions

package_path

'/mnt/c/ML/DU/local_rag_llm/run/pkgs'

Define variables

In [3]:
# reload any changes to Config Settings
importlib.reload(config)

project_path = config.project_path
print(f"Project_path: {project_path}")

doc_folder = config.data_folder_aiml
print(f"Doc Folder: {doc_folder}")

models_path = config.models_path
print(f"Models path: {models_path}")

loaded_documents = project_path + "run/pkgs/loaded_files.pkl"
print(f"Loaded Docs file: {loaded_documents}")

Project_path: /mnt/c/ML/DU/local_rag_llm/
Doc Folder: /mnt/c/ML/DU/local_rag_llm/data/aiml/
Models path: /mnt/c/ML/DU/local_rag_llm/models/
Loaded Docs file: /mnt/c/ML/DU/local_rag_llm/run/pkgs/loaded_files.pkl


### Create an Embedding Function

we want to use this GPU embedding function.  Not sure how to do that with http client.  So this is for future

In [4]:
# from torch import cuda
# from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# embed_model_id = models_path + 'sentence-transformers/all-MiniLM-L6-v2'

# device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# embed_model = HuggingFaceEmbeddings(
#     model_name=embed_model_id,
#     model_kwargs={'device': device},
#     encode_kwargs={'device': device, 'batch_size': 32}
# )

# print(f"Device: {device}")

This embedding function will be used with ChromaDB.  Does not use GPU.  :(

In [5]:
from chromadb.utils import embedding_functions

# Use a different sentence transformer: all-mpnet-base-v2
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


### Load Documents

- **Takes:** doc_folder, loaded_documents
- **Creates:** documents

In [6]:
# reload any changes to rag_functions
importlib.reload(rag_functions)

# Get list of documents already read
imported_docs = rag_functions.load_imported_docs(loaded_documents)

# Uncomment to reset to empty list
imported_docs = {}
imported_docs_updated, documents = rag_functions.read_doc_directory( doc_folder, imported_docs)

rag_functions.write_imported_docs(imported_docs_updated, loaded_documents)

The file /mnt/c/ML/DU/local_rag_llm/run/pkgs/loaded_files.pkl exists.
Loaded: A Primer on Generative Artificial Intelligence.pdf
Loaded: Constructing Dreams using Generative AI.pdf
Loaded: Ethics of Generative AI.pdf
Loaded: Experimental evidence on the productivity effects of generative artificial intelligence.pdf


In [7]:
print(f"Document Pages: {len(documents)}")
rag_functions.print_dict(imported_docs_updated, "imported docs")


Document Pages: 31

Dictionary: imported docs
  A Primer on Generative Artificial Intelligence.pdf: /mnt/c/ML/DU/local_rag_llm/data/aiml/A Primer on Generative Artificial Intelligence.pdf
  Constructing Dreams using Generative AI.pdf: /mnt/c/ML/DU/local_rag_llm/data/aiml/Constructing Dreams using Generative AI.pdf
  Ethics of Generative AI.pdf: /mnt/c/ML/DU/local_rag_llm/data/aiml/Ethics of Generative AI.pdf
  Experimental evidence on the productivity effects of generative artificial intelligence.pdf: /mnt/c/ML/DU/local_rag_llm/data/aiml/Experimental evidence on the productivity effects of generative artificial intelligence.pdf


### Split Documents

- **Takes:** documents
- **Creates:** doc_chunks

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
doc_chunks = text_splitter.split_documents(documents)

print(f"Num Doc Chunks: {len(doc_chunks)}")

Num Doc Chunks: 191


In [9]:
print(doc_chunks[4])

page_content='ciated with intelligence also apply to AI; therefore, as we take a deeper dive into AI, we
should reflect upon the relevance of this definition to AI.
Artificial intelligence (AI) has gained much momentum in recent years due to the
advancements in hardware and software technologies. AI has several definitions, and some
of them share similar attributes. Some of these definitions are listed as under:
Educ. Sci. 2024, 14, 172. https://doi.org/10.3390/educsci14020172
https://www.mdpi.com/journal/education' metadata={'source': '/mnt/c/ML/DU/local_rag_llm/data/aiml/A Primer on Generative Artificial Intelligence.pdf', 'file_path': '/mnt/c/ML/DU/local_rag_llm/data/aiml/A Primer on Generative Artificial Intelligence.pdf', 'page': 0, 'total_pages': 15, 'format': 'PDF 1.7', 'title': 'A Primer on Generative Artificial Intelligence', 'author': 'Faisal Kalota', 'subject': 'Many educators and professionals in different industries may need to become more familiar with the basic concepts 

In [10]:
rag_functions.print_dict(doc_chunks[4].metadata, 'metadata')


Dictionary: metadata
  source: /mnt/c/ML/DU/local_rag_llm/data/aiml/A Primer on Generative Artificial Intelligence.pdf
  file_path: /mnt/c/ML/DU/local_rag_llm/data/aiml/A Primer on Generative Artificial Intelligence.pdf
  page: 0
  total_pages: 15
  format: PDF 1.7
  title: A Primer on Generative Artificial Intelligence
  author: Faisal Kalota
  subject: Many educators and professionals in different industries may need to become more familiar with the basic concepts of artificial intelligence (AI) and generative artificial intelligence (Gen-AI). Therefore, this paper aims to introduce some of the basic concepts of AI and Gen-AI. The approach of this explanatory paper is first to introduce some of the underlying concepts, such as artificial intelligence, machine learning, deep learning, artificial neural networks, and large language models (LLMs), that would allow the reader to better understand generative AI. The paper also discusses some of the applications and implications of genera

### Store Document Chunks into ChromaDB

- Takes: doc_chunks
- Creates: ChromaDB collection (collection_name from config.py)
- Uses embed_model / sentence_transformer_ef

Setup ChromaDB

In [11]:
collection_name = config.collection_name

print(f"Collection Name: {collection_name}")

Collection Name: Gen_AI_knowledge


In [12]:
# reload any changes to rag_functions
importlib.reload(rag_functions)

<module 'rag_functions' from '/mnt/c/ML/DU/local_rag_llm/run/pkgs/rag_functions.py'>

In [13]:
#from chromadb.config import Settings

client = chromadb.HttpClient(host='localhost', port=8200)
#client.reset()  # resets the database

# Get list of collections
collections = client.list_collections()

# Determine if collection already exist
exists = rag_functions.collection_exists(collections, collection_name)

# Clear and create collection
if exists:
    # Delete and Recreate
    client.delete_collection(name=collection_name)
    collection = client.create_collection(
                        name=collection_name, 
                        embedding_function=sentence_transformer_ef)
else:
    collection = client.create_collection(
                        name=collection_name, 
                        embedding_function=sentence_transformer_ef)


In [14]:
collection_len = collection.count()
collection_list = collection.peek()

rag_functions.print_list(collections, "ChromaDB Collections")
print(f"Num Items in collection\n{collection_len}")
rag_functions.print_dict(collection_list, "First 10 items in collection")



List: ChromaDB Collections
  0: Collection(id=117f7132-8837-4bf2-9e3d-b7661f82bdb3, name=knowledge_base)
  1: Collection(id=588ec510-0b68-4c99-884d-5738768d1710, name=my_collection)
  2: Collection(id=a9cb1836-5fd3-4643-8495-118745c8d302, name=Gen_AI_knowledge)
  3: Collection(id=dd394da5-2c20-4bce-8374-89a0ce611153, name=ML_doc_collection)
  4: Collection(id=f31f0737-b78e-46c9-86fd-efc4a56fffe9, name=local_kb)
Num Items in collection
0

Dictionary: First 10 items in collection
  ids: []
  embeddings: []
  metadatas: []
  documents: []
  data: None
  uris: None
  included: ['embeddings', 'documents', 'metadatas']


Prepare Chunks for addition to DB

In [15]:
# Prepare chunks for ChromaDB

metadata_list = []
content_list = []
#id_list = []
for i, document in enumerate(doc_chunks):
    content     = document.page_content
    metadata    = document.metadata
    file_path   = metadata['source']
    page        = metadata['page']
#    index       = metadata['start_index']
    
    file_parts  = file_path.split('/')
    filename = file_parts[len(file_parts)-1]
    
    metadata['filename'] = filename
    
#    id = filename + "_" + str(page) + "_" + str(index)
    
    content_list.append(content)
    metadata_list.append(metadata)
#    id_list.append(id)

In [16]:
print(f"Number of chunks: {len(content_list)}")

Number of chunks: 191


Add Chunks to the DB

Note:  this code using vectorstore is desired.  However, it does not work with an chroma http client.  Need to investigate further

In [17]:
# from langchain.vectorstores import Chroma
# from langchain.embeddings import GPT4AllEmbeddings

# vectorstore = Chroma.from_documents(documents=all_splits, embedding=embed_model)

Adding Chunks the more manual way to ChromaDB

In [18]:
# create the chroma client
import uuid

for i in range(len(content_list)):
    collection.add(
        ids=str(uuid.uuid1()), 
        metadatas=metadata_list[i], 
        documents=content_list[i]
    )
    if (i % 100 == 0):
        print (f"Loaded {i}/{len(content_list)} document chunks")



Loaded 0/191 document chunks
Loaded 100/191 document chunks


In [22]:
collection_len = collection.count()
collection_list = collection.peek()

rag_functions.print_list(collections, "ChromaDB Collections")
print(f"Num Items in collection\n{collection_len}")
rag_functions.print_dict(collection_list, "First 10 items in collection")


List: ChromaDB Collections
  0: Collection(id=117f7132-8837-4bf2-9e3d-b7661f82bdb3, name=knowledge_base)
  1: Collection(id=588ec510-0b68-4c99-884d-5738768d1710, name=my_collection)
  2: Collection(id=a9cb1836-5fd3-4643-8495-118745c8d302, name=Gen_AI_knowledge)
  3: Collection(id=dd394da5-2c20-4bce-8374-89a0ce611153, name=ML_doc_collection)
  4: Collection(id=f31f0737-b78e-46c9-86fd-efc4a56fffe9, name=local_kb)
Num Items in collection
191

Dictionary: First 10 items in collection
  ids: ['55d23170-503b-11ef-9594-b9cfb04757a0', '55d23171-503b-11ef-9594-b9cfb04757a0', '55d23172-503b-11ef-9594-b9cfb04757a0', '56bc6696-503b-11ef-9594-b9cfb04757a0', '56bc6697-503b-11ef-9594-b9cfb04757a0', '56bc6698-503b-11ef-9594-b9cfb04757a0', '56bc6699-503b-11ef-9594-b9cfb04757a0', '580b6416-503b-11ef-9594-b9cfb04757a0', '580b6417-503b-11ef-9594-b9cfb04757a0', '580b6418-503b-11ef-9594-b9cfb04757a0']
  embeddings: [[0.032426539808511734, 0.012399652972817421, -0.0715695470571518, -0.012869068421423435, -0

### Test

In [25]:
results = collection.query(
    query_texts=[
        "What is the difference between machine learning and deep learning"
    ],
    n_results=3
)

rag_functions.print_dict(results, "query results")


Dictionary: query results
  ids: [['5a73e662-503b-11ef-9594-b9cfb04757a0', '5a73e661-503b-11ef-9594-b9cfb04757a0', '591cda08-503b-11ef-9594-b9cfb04757a0']]
  distances: [[0.4755287170410156, 0.49740636348724365, 0.8618975877761841]]
  embeddings: None
  metadatas: [[{'author': 'Faisal Kalota', 'creationDate': "D:20240219111535+02'00'", 'creator': 'LaTeX with hyperref', 'file_path': '/mnt/c/ML/DU/local_rag_llm/data/aiml/A Primer on Generative Artificial Intelligence.pdf', 'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'format': 'PDF 1.7', 'keywords': 'artificial intelligence; AI; generative artificial intelligence; generative AI; GAI; GenAI; Gen-AI; ChatGPT; LLM; GPT; AI businesses; AI education; AI ethics; AI security', 'modDate': "D:20240219102017+01'00'", 'page': 4, 'producer': 'pdfTeX-1.40.25', 'source': '/mnt/c/ML/DU/local_rag_llm/data/aiml/A Primer on Generative Artificial Intelligence.pdf', 'subject': 'Many educators and professionals in different industries m

In [26]:
rag_functions.print_list(results['documents'][0], "document chunks")


List: document chunks
  0: Educ. Sci. 2024, 14, 172
5 of 15
Table 1. Differences between machine learning and deep learning.
Machine Learning
Deep Learning
Requires a relatively small amount of data for
training and prediction.
Requires large amounts of data for training
and prediction.
It does not require extensive computational
power, and low-end central processing units
(CPUs) may be sufficient.
High-end computational power is required. A
graphic processing unit (GPU) is needed.
The time to train the model is relatively small.
The time to train a model is relatively high.
Simple linear correlational models.
Non-linear complex correlational models.
The output of machine learning algorithms is
generally a numerical value.
The output is not limited to a single numeric
value but could be in different formats.
Deep learning is inspired by the human brain and utilizes artificial neural networks
(ANN). In order to understand deep learning, it is crucial to understand artificial neural
  1