In [None]:
# Install with `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`

In [19]:
# Install the packages this notebook imports. %pip targets the running kernel's
# environment; pip's own output notes the kernel may need a restart afterwards.
%pip install langchain langchain-community pypdf openai chromadb tiktoken
# Upgrade/install the separate langchain-openai package that provides OpenAIEmbeddings.
%pip install -U langchain-openai

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-openai
  Downloading langchain_openai-0.1.20-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: langchain-openai
Successfully installed langchain-openai-0.1.20
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Langchain dependencies
from langchain.document_loaders.pdf import PyPDFDirectoryLoader # Importing PDF loader from Langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain
# from langchain.embeddings import OpenAIEmbeddings # Importing OpenAI embeddings from Langchain
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document # Importing Document schema from Langchain
from langchain.vectorstores.chroma import Chroma # Importing Chroma vector store from Langchain
from langchain.chat_models import ChatOpenAI # Import OpenAI LLM
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv # Importing dotenv to get API key from .env file
import os # Importing os module for operating system functionalities
import shutil # Importing shutil module for high-level file operations

In [2]:
# Default directory that holds the PDF files to index:
DATA_PATH = "./data/"

def load_documents(data_path: str = None):
  """
  Load PDF documents from a directory using PyPDFDirectoryLoader.
  Args:
    data_path (str, optional): Directory to read PDF files from. When omitted
      (None), the module-level DATA_PATH is used; it is looked up at call
      time, so re-assigning DATA_PATH in a later cell still takes effect.
  Returns:
    List of Document objects: Loaded PDF documents represented as Langchain
    Document objects.
  """
  # Fall back to the notebook-wide default when no directory is given
  if data_path is None:
    data_path = DATA_PATH
  # Initialize PDF loader with the chosen directory
  document_loader = PyPDFDirectoryLoader(data_path)
  # Load PDF documents and return them as a list of Document objects
  return document_loader.load()

In [3]:
documents = load_documents() # Call the function
# Inspect the contents of the first document as well as metadata.
# Guard the lookup so an empty result is reported with a clear message
# instead of failing with an IndexError on documents[0].
if documents:
  print(documents[0])
else:
  print(f"No PDF documents were loaded from {DATA_PATH!r}.")

page_content='Ski
Boots
TDBootz
Special
This
guide
explains
the
special
consider ation
for
choosing
TDBootz
Special
Ski
boots.
1.
Understanding
Ski
Boot
Components:
Befor e
delving
int o
selecting
ski
boots,
it' s
essential
t o
understand
the
v arious
components
that
mak e
up
a
ski
boot:
●
Shell:
The
outer
body
of
the
boot,
typically
made
of
plastic,
which
pr o vides
structur e
and
suppor t.
●
Liner:
The
inner
la y er
of
the
boot,
usually
made
of
foam,
which
pr o vides
cushioning,
insulation,
and
comfor t.
●
Buckles:
Ski
boots
usually
ha v e
se v er al
buckles
or
str aps
t o
secur e
the
boot
tightly
ar ound
y our
foot
and
leg.
●
Cuff:
The
upper
par t
of
the
boot
that
wr aps
ar ound
y our
lower
leg,
pr o viding
suppor t
and
tr ansmitting
mo v ements
t o
the
skis.
●
Sole:
The
bott om
of
the
boot,
which
inter faces
with
the
ski
bindings.
●
Ski/W alk
Mode:
Many
modern
ski
boots
f eatur e
a
mechanism
that
allows
y ou
t o
switch
between
a
stiff,
lock ed
position
for
skiing
and
a
mor e
ﬂexibl

In [4]:
chunk_size = 300 # Size of each chunk in characters
chunk_overlap = 100 # Overlap between consecutive chunks

def split_text(documents: list[Document]):
  """
  Split the text content of the given list of Document objects into smaller chunks.

  The splitter is configured from the module-level chunk_size and
  chunk_overlap values, measures length with len, and records each chunk's
  start index in its metadata (add_start_index=True).

  Args:
    documents (list[Document]): List of Document objects containing text content to split.
  Returns:
    list[Document]: List of Document objects representing the split text chunks.
      May be empty when there is nothing to split.
  """
  # Initialize text splitter with specified parameters
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, # Size of each chunk in characters
    chunk_overlap=chunk_overlap, # Overlap between consecutive chunks
    length_function=len, # Function to compute the length of the text
    add_start_index=True, # Flag to add start index to each chunk
  )

  # Split documents into smaller chunks using text splitter
  chunks = text_splitter.split_documents(documents)
  print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

  # Print example of page content and metadata for a chunk.
  # Only do so when at least one chunk exists; indexing chunks[0] on an
  # empty result would raise IndexError before the function could return.
  if chunks:
    example = chunks[0]
    print(example.page_content)
    print(example.metadata)

  return chunks # Return the list of split text chunks

In [7]:
import chromadb

# Shared HTTP client for the Chroma server; the cells below pass it to the
# Chroma vector store when writing and when querying.
# Assumes a Chroma server is already listening on this host/port.
chroma_client = chromadb.HttpClient(
  host="localhost",
  port=8000,
)


In [8]:
def save_to_chroma(chunks: list[Document]):
  """
  Embed the given Document chunks with OpenAI embeddings and store them in Chroma.
  Args:
  chunks (list[Document]): List of Document objects representing text chunks to save.
  Returns:
  None
  """

  # Build the Chroma store from the chunks using OpenAI embeddings. The data is
  # written through the shared chroma_client, so there is no separate persist
  # step here and the returned store object does not need to be kept.
  # NOTE(review): re-running this presumably adds the same chunks again - confirm.
  Chroma.from_documents(
    chunks,
    OpenAIEmbeddings(),
    client=chroma_client,
  )

  print(f"Saved {len(chunks)} chunks.")

In [10]:
def generate_data_store():
  """
  Build the Chroma vector database: load the PDF documents, split them into
  chunks, and save those chunks to Chroma.
  """
  # Each stage feeds the next: load -> split -> save
  loaded_docs = load_documents()
  text_chunks = split_text(loaded_docs)
  save_to_chroma(text_chunks)

# Read environment variables from the .env file before anything needs them
# (override=True is passed so values from .env are applied over existing ones)
load_dotenv(override=True)
# Run the whole pipeline once
generate_data_store()

Split 15 documents into 139 chunks.
Ski
Boots
TDBootz
Special
This
guide
explains
the
special
consider ation
for
choosing
TDBootz
Special
Ski
boots.
1.
Understanding
Ski
Boot
Components:
Befor e
delving
int o
selecting
ski
boots,
it' s
essential
t o
understand
the
v arious
components
that
mak e
up
a
ski
boot:
●
Shell:
The
outer
body
{'source': 'data/Ski_Boots_TDBootz_Special.pdf', 'page': 0, 'start_index': 0}
Saved 139 chunks to chroma.


In [11]:
# Sample question that is passed to query_rag further down
query_text = "List the steps to assemble a bike."

In [12]:
# Prompt sent to the chat model. query_rag fills {context} with the retrieved
# chunk texts and {question} with the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
 - -
Answer the question based on the above context: {question}
"""

In [16]:
def query_rag(query_text, base_url: str = None, k: int = 3, min_relevance: float = 0.7):
  """
  Query a Retrieval-Augmented Generation (RAG) system using Chroma database and OpenAI.
  Args:
    - query_text (str): The text to query the RAG system with.
    - base_url (str, optional): Forwarded to OpenAIEmbeddings as its base_url.
      It is not applied to the chat model.
    - k (int): Number of chunks to retrieve from the vector store.
    - min_relevance (float): Minimum relevance score the best match must reach
      for the chat model to be queried.
  Returns:
    - formatted_response (str): Formatted response including the generated text and sources.
    - response_text (str): The generated response text. When no sufficiently
      relevant context is found, this is the "Unable to find matching results."
      message and the chat model is not called.
  """

  # YOU MUST - Use same embedding function as before.
  # SECURITY: no API key is passed in code. A secret key was previously embedded
  # here as a string literal; treat that key as leaked and revoke it. The key is
  # now taken from the environment (OPENAI_API_KEY, e.g. loaded from .env by
  # load_dotenv), the same way save_to_chroma builds its embeddings.
  embedding_function = OpenAIEmbeddings(base_url=base_url)

  # Prepare the database
  db = Chroma(client=chroma_client, embedding_function=embedding_function)

  # Retrieving the context from the DB using similarity search
  results = db.similarity_search_with_relevance_scores(query_text, k=k)

  # Stop here if nothing matched or the best match is not relevant enough.
  # Previously the message was printed but the model was still queried with
  # empty or weak context.
  if len(results) == 0 or results[0][1] < min_relevance:
    no_match_message = "Unable to find matching results."
    print(no_match_message)
    return f"Response: {no_match_message}\nSources: {[]}", no_match_message

  # Combine context from matching documents
  context_text = "\n\n - -\n\n".join([doc.page_content for doc, _score in results])

  # Create prompt template using context and query text
  prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
  prompt = prompt_template.format(context=context_text, question=query_text)

  # Initialize OpenAI chat model
  model = ChatOpenAI()

  # Generate response text based on the prompt.
  # invoke() replaces the deprecated predict(); it returns a message object
  # whose .content holds the generated text.
  response_text = model.invoke(prompt).content

  # Get sources of the matching documents
  sources = [doc.metadata.get("source", None) for doc, _score in results]

  # Format and return response including generated text and sources
  formatted_response = f"Response: {response_text}\nSources: {sources}"
  return formatted_response, response_text

# Run the RAG query for the sample question held in query_text
formatted_response, response_text = query_rag(query_text)
# Print only the generated answer; formatted_response additionally lists the sources
print(response_text)

1. Unbox the bike and lay out all the components.
2. Attach the front wheel to the fork, ensuring it is securely fastened with the provided axle and nut.
3. Adjust the handlebar height.
4. Carefully unpack all components to ensure nothing is damaged.
5. Attach the handlebars to the stem and tighten the bolts securely.
6. Install the front wheel, ensuring it is centered and secured with quick-release or axle nuts.
7. Adjust the seat and maintain your bike.
