### ChromaDB

In [1]:
pip install chromadb

Note: you may need to restart the kernel to use updated packages.


In [2]:
import chromadb
client = chromadb.PersistentClient(path="./chroma_db")

In [3]:
collection = client.create_collection(name="collection1")

# Insert sample data (ID, embeddings, metadata)
collection.add(
    ids=["1", "2", "3"],
    embeddings=[[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5]],
    metadatas=[{"name": "Keerti"}, {"name": "Amit"}, {"name": "Educosys"}]
)

In [4]:
print("Available Collections:", client.list_collections())

Available Collections: ['collection1']


In [5]:
print("Fetching data with ID 1:", collection.get(ids=["2"]))

Fetching data with ID 1: {'ids': ['2'], 'embeddings': None, 'documents': [None], 'uris': None, 'data': None, 'metadatas': [{'name': 'Amit'}], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [6]:
print("Fetching data with ID 1:", collection.get(ids=["2"], include=["embeddings", "metadatas"]))

Fetching data with ID 1: {'ids': ['2'], 'embeddings': array([[0.2       , 0.30000001, 0.40000001]]), 'documents': None, 'uris': None, 'data': None, 'metadatas': [{'name': 'Amit'}], 'included': [<IncludeEnum.embeddings: 'embeddings'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [7]:
collection.add(
    ids=["4"],
    embeddings=[[0.1, 0.2, 0.3]],
    documents=["Someone is a software engineer with 5 years of experience."]
)

In [10]:
collection.query(
    query_embeddings=[[0.3, 0.4, 0.5]],
    n_results=2,
    include=["documents"]
)

{'ids': [['3', '2']],
 'embeddings': None,
 'documents': [[None, None]],
 'uris': None,
 'data': None,
 'metadatas': None,
 'distances': None,
 'included': [<IncludeEnum.documents: 'documents'>]}

In [11]:
collection.update(
    ids=["2"],
    embeddings=[[0.5, 0.5, 0.5]],
    metadatas=[{"name": "Bob Updated"}]
)
print("Updated Entry:", collection.get(ids=["2"]))

Updated Entry: {'ids': ['2'], 'embeddings': None, 'documents': [None], 'uris': None, 'data': None, 'metadatas': [{'name': 'Bob Updated'}], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [12]:
print("Fetching data with ID 1:", collection.get(ids=["2"]))

Fetching data with ID 1: {'ids': ['2'], 'embeddings': None, 'documents': [None], 'uris': None, 'data': None, 'metadatas': [{'name': 'Bob Updated'}], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [None]:
collection.delete(ids=["3"])
print("After Deletion:", collection.get(ids=["3"]))

After Deletion: {'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'data': None, 'metadatas': [], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
Available Collections: ['collection1']


In [14]:
for collection in client.list_collections():
    client.delete_collection(collection)

In [15]:
print("Available Collections:", client.list_collections())

Available Collections: []


### **_OpenAI Embeddings_**

In [17]:
!pip install python-dotenv openai



In [16]:
client = chromadb.PersistentClient(path="./chroma_db")  # This persists data
collection = client.get_or_create_collection(name="collection2")

In [18]:
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

import os
api_key = os.getenv('OPENAI_API_KEY')  # Retrieve the key

In [19]:
import openai
def get_openai_embedding(text):
    response = openai.embeddings.create(input=[text], model="text-embedding-ada-002") # OpenAI's embedding model
    return response.data[0].embedding

In [20]:
documents = [
    "The Eiffel Tower is located in Paris.",
    "The Colosseum is in Rome, Italy.",
    "The Taj Mahal is a famous monument in India.",
    "Mount Everest is the highest mountain in the world.",
    "Python is a popular programming language."
]

# Convert documents to embeddings
embeddings = [get_openai_embedding(doc) for doc in documents]

# Insert into ChromaDB
collection.add(
    ids=[str(i) for i in range(len(documents))],  # Unique IDs
    documents=documents,
    embeddings=embeddings
)

print("Data added successfully!")

Data added successfully!


In [23]:
query_text = "Where is the Eiffel Tower?"
query_embedding = get_openai_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"]
)

print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

Query: Where is the Eiffel Tower?
Most Similar Result: ['The Eiffel Tower is located in Paris.', 'The Colosseum is in Rome, Italy.']
Distance: [0.12595291410076773, 0.38459610098181296]


In [24]:
updated_text = "The Eiffel Tower is one of the most visited landmarks in the world."
updated_embedding = get_openai_embedding(updated_text)

collection.update(
    ids=["0"],  # ID of the document to update
    documents=[updated_text],
    embeddings=[updated_embedding]
)

print("Data updated successfully!")

Data updated successfully!


In [25]:
query_text = "Where is the Eiffel Tower?"
query_embedding = get_openai_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"]
)

print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

Query: Where is the Eiffel Tower?
Most Similar Result: ['The Eiffel Tower is one of the most visited landmarks in the world.', 'The Colosseum is in Rome, Italy.']
Distance: [0.18844830998746137, 0.38459610098181296]


In [26]:
tower_ht_text = "Eiffel Tower is 330 tall."

collection.add(
    ids=["6"],
    embeddings=get_openai_embedding(tower_ht_text),
    documents=tower_ht_text
)

In [27]:
query_text = "Where is the Eiffel Tower?"
query_embedding = get_openai_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"]
)

print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

Query: Where is the Eiffel Tower?
Most Similar Result: ['The Eiffel Tower is one of the most visited landmarks in the world.', 'Eiffel Tower is 330 tall.']
Distance: [0.18844830998746137, 0.22631516545745847]


In [28]:
collection.delete(ids=["0"])  # Delete document with ID "0"
print("Data deleted successfully! ❌")

Data deleted successfully! ❌


In [29]:
query_text = "Where is the Eiffel Tower?"
query_embedding = get_openai_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"]
)

print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

Query: Where is the Eiffel Tower?
Most Similar Result: ['Eiffel Tower is 330 tall.', 'The Colosseum is in Rome, Italy.']
Distance: [0.22631516545745847, 0.38459610098181296]


### **RAG**

In [58]:
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

import os
os.getenv('OPENAI_API_KEY') 
os.getenv('LANGCHAIN_TRACING_V2') 
os.getenv('LANGCHAIN_API_KEY') 
os.getenv('LANGSMITH_PROJECT') 

'ragclass'

In [30]:
!pip install langchain_community langchain-openai langchainhub chromadb langchain



In [31]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader(
    web_paths=["https://www.educosys.com/course/genai"]
)
docs = loader.load()
print(docs)

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://www.educosys.com/course/genai', 'title': 'Hands-on Generative AI Course', 'description': 'Hands-on Generative AI Course', 'language': 'en'}, page_content='Hands-on Generative AI CourseJoin ongoing LIVE Hands-On Generative AI courseEnroll NowCoursesBundle CoursesMentorFree ContentTestimonialsFAQLogin Signup Currently LIVEHands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 Weeks CourseClasses on Tue, Wed, Thurs (9PM IST - 11PM IST)Access all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project 1 - Build a Simple Neural Network Using TensorFlow Mini Project 2 - Train an Autoencoder on the MNIST Dataset2Week 2Dee

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split
# Overlap of 200 characters to maintain context across chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [38]:
print(splits[0])
print(splits[1])
print(splits[2])

page_content='Hands-on Generative AI CourseJoin ongoing LIVE Hands-On Generative AI courseEnroll NowCoursesBundle CoursesMentorFree ContentTestimonialsFAQLogin Signup Currently LIVEHands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 Weeks CourseClasses on Tue, Wed, Thurs (9PM IST - 11PM IST)Access all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project 1 - Build a Simple Neural Network Using TensorFlow Mini Project 2 - Train an Autoencoder on the MNIST Dataset2Week 2Deep Generative Models Discriminative and Generative models Generative Adversarial Networks (GANs) DCGAN and StyleGAN Variational Autoencoders (VAEs) Probabilistic Data Generation Usi

In [39]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OpenAIEmbeddings())

In [40]:
print(vectorstore._collection.count())  # Check total stored chunks

4


In [41]:
print(vectorstore._collection.get())

{'ids': ['1829471b-ca21-4bf7-a225-33eb8c5278ca', '9f657e1b-cdc1-4b3c-bfac-161c0177ca0f', '67062071-3860-481b-9355-711117d658ad', '7221ceda-f647-42e0-872c-f3ad210d0a0a'], 'embeddings': None, 'documents': ['Hands-on Generative AI CourseJoin ongoing LIVE Hands-On Generative AI courseEnroll NowCoursesBundle CoursesMentorFree ContentTestimonialsFAQLogin Signup Currently LIVEHands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 Weeks CourseClasses on Tue, Wed, Thurs (9PM IST - 11PM IST)Access all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project 1 - Build a Simple Neural Network Using TensorFlow Mini Project 2 - Train an Autoencoder on the MNIST Datase

In [44]:
print('\nCollection 1 - ', vectorstore._collection.get(ids=['1829471b-ca21-4bf7-a225-33eb8c5278ca'], include=["embeddings", "documents"]))
print('\nCollection 2 - ', vectorstore._collection.get(ids=['9f657e1b-cdc1-4b3c-bfac-161c0177ca0f'], include=["embeddings", "documents"]))
print('\nCollection 3 - ', vectorstore._collection.get(ids=['67062071-3860-481b-9355-711117d658ad'], include=["embeddings", "documents"]))
print('\nCollection 4 - ', vectorstore._collection.get(ids=['7221ceda-f647-42e0-872c-f3ad210d0a0a'], include=["embeddings", "documents"]))


Collection 1 -  {'ids': ['1829471b-ca21-4bf7-a225-33eb8c5278ca'], 'embeddings': array([[-0.01335995, -0.02595686,  0.00693545, ..., -0.03959613,
         0.0069082 , -0.01369378]]), 'documents': ['Hands-on Generative AI CourseJoin ongoing LIVE Hands-On Generative AI courseEnroll NowCoursesBundle CoursesMentorFree ContentTestimonialsFAQLogin Signup Currently LIVEHands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 Weeks CourseClasses on Tue, Wed, Thurs (9PM IST - 11PM IST)Access all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project 1 - Build a Simple Neural Network Using TensorFlow Mini Project 2 - Train an Autoencoder on the MNIST Dataset2Week 

In [45]:
retriever = vectorstore.as_retriever()

In [46]:
from langchain import hub
# Prompt
prompt = hub.pull("rlm/rag-prompt") # pulls a predefined RAG prompt template from LangChain Hub



In [None]:
# You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
# Question: {question} 
# Context: {context} 
# Answer:

In [47]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

In [52]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [54]:
rag_chain = ({"context" : retriever | format_docs, "question": RunnablePassthrough()}
             | prompt 
             | llm
             | StrOutputParser())


In [55]:
rag_chain.invoke("What are the timings of the genai course?")

'Classes for the Generative AI course are held on Tue, Wed, and Thurs from 9 PM IST to 11 PM IST. The course spans over 7 weeks and covers a variety of topics related to Generative AI. Students also have access to live batches and lifetime recordings of the classes.'

In [56]:
rag_chain.invoke("Give me the curriculum for week 1 for genai course")

'The curriculum for Week 1 of the GenAI course includes Foundations of Generative AI, Introduction to AI, Mathematical Foundations for AI, Basics of Neural Networks, and Mini Projects using TensorFlow and the MNIST Dataset.\n'

In [57]:
rag_chain.invoke("Are the recordings for the course available?")

'Yes, the course offers recordings of the classes. The course covers a range of topics in Generative AI over a span of 7 weeks, with live classes on specific days. Certificates are provided upon completion of the course.'

In [59]:
from langchain_core.runnables import RunnableLambda

In [60]:
def print_prompt(prompt_text):
    print("\nPrompt - ", prompt_text)
    return prompt_text

rag_chain = ({"context" : retriever | format_docs, "question": RunnablePassthrough()}
             | prompt 
             | RunnableLambda(print_prompt)
             | llm
             | StrOutputParser())

In [61]:
rag_chain.invoke("What are the timings of the genai course?")


Prompt -  messages=[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: What are the timings of the genai course? \nContext: Hands-on Generative AI CourseJoin ongoing LIVE Hands-On Generative AI courseEnroll NowCoursesBundle CoursesMentorFree ContentTestimonialsFAQLogin Signup Currently LIVEHands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 Weeks CourseClasses on Tue, Wed, Thurs (9PM IST - 11PM IST)Access all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedfor

'Classes for the Genai course are held on Tue, Wed, and Thurs from 9PM IST to 11PM IST. The course lasts for 7 weeks and covers topics ranging from foundational concepts to advanced projects. Access to all live batches and recordings is provided.'