In [1]:
# Langchain dependencies
from langchain.document_loaders.pdf import PyPDFDirectoryLoader # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
from langchain.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain.prompts import ChatPromptTemplate
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
from langchain.chat_models import ChatOpenAI # Import OpenAI LLM
import chromadb

import os # Importing os module for operating system functionalities
import shutil # Importing shutil module for high-level file operations

In [2]:
# Directory to your pdf files:
DATA_PATH = "../data"
CHROMA_PATH = "chroma"

In [3]:
def load_documents():
  """
  Load PDF documents from the specified directory using PyPDFDirectoryLoader.
  Returns:
  List of Document objects: Loaded PDF documents represented as Langchain
                                                          Document objects.
  """
  # Initialize PDF loader with specified directory
  document_loader = PyPDFDirectoryLoader(DATA_PATH) 
  # Load PDF documents and return them as a list of Document objects
  return document_loader.load() 

In [4]:
def split_text(documents: list[Document]):
  """
  Split the text content of the given list of Document objects into smaller chunks.
  Args:
    documents (list[Document]): List of Document objects containing text content to split.
  Returns:
    list[Document]: List of Document objects representing the split text chunks.
  """
  # Initialize text splitter with specified parameters
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300, # Size of each chunk in characters
    chunk_overlap=100, # Overlap between consecutive chunks
    length_function=len, # Function to compute the length of the text
    add_start_index=True, # Flag to add start index to each chunk
  )

  # Split documents into smaller chunks using text splitter
  chunks = text_splitter.split_documents(documents)
  print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

  # Print example of page content and metadata for a chunk
  document = chunks[0]
  print(document.page_content)
  print(document.metadata)

  return chunks # Return the list of split text chunks

In [5]:
# def save_to_chroma(chunks: list[Document]):
#   """
#   Save the given list of Document objects to a Chroma database.
#   Args:
#   chunks (list[Document]): List of Document objects representing text chunks to save.
#   Returns:
#   None
#   """

#   # Clear out the existing database directory if it exists
#   if os.path.exists(CHROMA_PATH):
#     shutil.rmtree(CHROMA_PATH)

#   db = 
#   # Create a new Chroma database from the documents using OpenAI embeddings
#   db = Chroma.from_documents(
#     chunks,
#     OpenAIEmbeddings(),
#     persist_directory=CHROMA_PATH
#   )

#   # Persist the database to disk
#   db.persist()
#   print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [6]:
def collection_exists(collections, collection_name):
    for collection in collections:
        if collection.name == collection_name:
            return True
    return False

In [7]:
# def generate_data_store():
#   """
#   Function to generate vector database in chroma from documents.
#   """
#   documents = load_documents() # Load documents from a source
#   chunks = split_text(documents) # Split documents into manageable chunks
  
  
  
  
#   save_to_chroma(chunks) # Save the processed data to a data store

## Start workflow here

Create an Embedding Function

In [8]:
from chromadb.utils import embedding_functions

# Use a different sentence transformer: all-mpnet-base-v2
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


Create Client and Collection

In [9]:
# Get a chromadb client
client = chromadb.HttpClient(host='localhost', port=8200)

# chroma_collection = client.get_collection(  name=collection_name, 
#                                             embedding_function=sentence_transformer_ef)

In [10]:
# Create the collection
collection_name = "local_kb"

# Get list of collections
collections = client.list_collections()

# Determine if collection already exist
exists = collection_exists(collections, collection_name)

# Clear and create collection
if exists:
    # Delete and Recreate
    client.delete_collection(name=collection_name)
    chroma_collection = client.create_collection(
                        name=collection_name, 
                        embedding_function=sentence_transformer_ef)
else:
    chroma_collection = client.create_collection(
                        name=collection_name, 
                        embedding_function=sentence_transformer_ef)

Load and split Documents

In [11]:
# Load all the documents and create document structure
documents = load_documents() # Call the function

# Split documents into chunks
chunks = split_text(documents)

Split 42 documents into 544 chunks.
Citation: Kalota, F. A Primer on
Generative Artificial Intelligence.
Educ. Sci. 2024 ,14, 172. https://
doi.org/10.3390/educsci14020172
Academic Editors: Gary K. W. Wong
and Ho-Yin Cheung
Received: 6 August 2023
Revised: 20 January 2024
Accepted: 24 January 2024
Published: 7 February 2024
{'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'page': 0, 'start_index': 0}


Inspect a chunk

In [25]:
document = chunks[4]

print(document)
print(f"\n{document.page_content}")
print(f"\n{document.metadata}")


page_content='Abstract: Many educators and professionals in different industries may need to become more familiar
with the basic concepts of artificial intelligence (AI) and generative artificial intelligence (Gen-AI).' metadata={'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'page': 0, 'start_index': 749, 'filename': 'A Primer on Generative Artificial Intelligence.pdf'}

Abstract: Many educators and professionals in different industries may need to become more familiar
with the basic concepts of artificial intelligence (AI) and generative artificial intelligence (Gen-AI).

{'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'page': 0, 'start_index': 749, 'filename': 'A Primer on Generative Artificial Intelligence.pdf'}


In [13]:
len(chunks)

544

Load documents and add to DB

In [16]:
metadata_list = []
content_list = []
id_list = []
for i, document in enumerate(chunks):
    content     = chuck[i].page_content
    file_path   = chuck[i].metadata['source']
    page        = chuck[i].metadata['page']
    index       = chuck[i].metadata['start_index']
    
    file_parts  = file_path.split('/')
    filename = file_parts[len(file_parts)-1]
    
    chuck[i].metadata['filename'] = filename
    
    id = filename + "_" + str(page) + "_" + str(index)
    
    content_list.append(content)
    metadata_list.append(chuck[i].metadata)
    id_list.append(id)

chroma_collection.add(
    documents=content_list,
    metadatas=metadata_list,
    ids=id_list
)

**Test:**  Query the collection

In [26]:
collection_len = chroma_collection.count()
collection_list = chroma_collection.peek()

print(collection_len)

print(collection_list)

544
{'ids': ['A Primer on Generative Artificial Intelligence.pdf_0_0', 'A Primer on Generative Artificial Intelligence.pdf_0_1253', 'A Primer on Generative Artificial Intelligence.pdf_0_1450', 'A Primer on Generative Artificial Intelligence.pdf_0_1606', 'A Primer on Generative Artificial Intelligence.pdf_0_1783', 'A Primer on Generative Artificial Intelligence.pdf_0_1969', 'A Primer on Generative Artificial Intelligence.pdf_0_212', 'A Primer on Generative Artificial Intelligence.pdf_0_2135', 'A Primer on Generative Artificial Intelligence.pdf_0_2397', 'A Primer on Generative Artificial Intelligence.pdf_0_2520'], 'embeddings': [[0.003859291784465313, 0.02870168723165989, -0.05448075756430626, -0.03492660075426102, -0.05232284218072891, -0.008302228525280952, 0.030071867629885674, 0.003325331024825573, -0.02789858542382717, 0.03346269577741623, 0.07012004405260086, 0.017248710617423058, -0.030361412093043327, 0.04366294667124748, 0.04912526533007622, -0.10506551712751389, 0.0133575061336

In [27]:
results = chroma_collection.query(
    query_texts=[
        "This is a query about machine learning and data science"
    ],
    n_results=3
)

print(results)

{'ids': [['A Primer on Generative Artificial Intelligence.pdf_2_2576', 'A Primer on Generative Artificial Intelligence.pdf_2_2989', 'A Primer on Generative Artificial Intelligence.pdf_4_630']], 'distances': [[0.8318580389022827, 0.8395542502403259, 0.8667265176773071]], 'embeddings': None, 'metadatas': [[{'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 2, 'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 2576}, {'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 2, 'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 2989}, {'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 4, 'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 630}]], 'documents': [['of machine learning (ML), as under:\n• Machine learning “allows the computer to learn automatically without human inter-\nvention or assistance” ([5], p. 386).\n

In [28]:
def print_dict( dict_item, name):
    print(f"\nDictionary: {name}")
    for key in dict_item.keys():
        print(f"  {key}: {dict_item[key]}")
        
def print_list( list_items, name):
    print(f"\nList: {name}")
    for i, item in enumerate(list_items):
        print(f"  {i}: {item}")

In [29]:
print_dict(results, 'results')


Dictionary: results
  ids: [['A Primer on Generative Artificial Intelligence.pdf_2_2576', 'A Primer on Generative Artificial Intelligence.pdf_2_2989', 'A Primer on Generative Artificial Intelligence.pdf_4_630']]
  distances: [[0.8318580389022827, 0.8395542502403259, 0.8667265176773071]]
  embeddings: None
  metadatas: [[{'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 2, 'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 2576}, {'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 2, 'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 2989}, {'filename': 'A Primer on Generative Artificial Intelligence.pdf', 'page': 4, 'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'start_index': 630}]]
  documents: [['of machine learning (ML), as under:\n• Machine learning “allows the computer to learn automatically without human inter-\nvention or assistance” 

In [None]:



# Inspect the contents of the first document as well as metadata
#print(documents[0])


# # Load environment variables from a .env file
# load_dotenv()
# # Generate the data store
# generate_data_store(mycollection)

Split 42 documents into 544 chunks.
Citation: Kalota, F. A Primer on
Generative Artificial Intelligence.
Educ. Sci. 2024 ,14, 172. https://
doi.org/10.3390/educsci14020172
Academic Editors: Gary K. W. Wong
and Ho-Yin Cheung
Received: 6 August 2023
Revised: 20 January 2024
Accepted: 24 January 2024
Published: 7 February 2024
{'source': '../data/A Primer on Generative Artificial Intelligence.pdf', 'page': 0, 'start_index': 0}
