In [32]:
import sys

# Extend the module search path with the local project directory.
# The original literal began with a stray single quote ("'/Users/..."), so the
# entry added to sys.path could never match a real directory; it is removed here.
# NOTE(review): this is still a machine-specific absolute path -- consider
# deriving it from the notebook location or an environment variable instead.
PROJECT_DIR = "/Users/A118390615/Library/CloudStorage/OneDrive-DeutscheTelekomAG/Projects/COE_Projects/langchain_EM_course1/intro_to_vector_dbs/"
if PROJECT_DIR not in sys.path:  # re-running this cell must not pile up duplicates
    sys.path.append(PROJECT_DIR)

import os
from dotenv import load_dotenv
load_dotenv()
# Copy the personal key into OPENAI_API_KEY for the rest of the session.
# Raises KeyError if OPENAI_API_KEY_PERSONAL is not set in the environment.
os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY_PERSONAL"]

from langchain_community.document_loaders.text import TextLoader
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain import hub
from langchain_core.prompts.prompt import PromptTemplate
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

In [30]:
# --- Ingestion: load the article, split it into chunks, embed and store them ---
loader = TextLoader(r"data/mediumArticle.txt")
text = loader.load()

# Split on blank lines into chunks of at most ~1000 characters, with 100
# characters of overlap between neighbouring chunks.
splitter = CharacterTextSplitter(separator="\n\n",
                                 chunk_size=1000,
                                 chunk_overlap=100)
text_splits = splitter.split_documents(text)
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# Deterministic ids (source + chunk position). Without explicit ids every
# re-run of this cell stores the same chunks again under new ids, so the index
# fills up with duplicates. With stable ids a re-run overwrites the existing
# records instead.
# NOTE(review): records written by earlier runs under auto-generated ids are
# not removed by this change -- clean the index once if duplicates exist.
chunk_ids = [
    f"{doc.metadata.get('source', 'unknown')}#chunk-{position}"
    for position, doc in enumerate(text_splits)
]

print("Ingestion")
vector_store = PineconeVectorStore(index_name="medium-blogs-embedding-index",
                                   embedding=embeddings)
vector_store.add_documents(text_splits, ids=chunk_ids)
print("Ingestion Finish")

# --- Retrieval: combine the chat model, the hub prompt and the vector store ---
print("Retrieval paradigm")

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
rag_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Chain that passes the retrieved documents to the LLM through rag_prompt.
retrieval_qa_chain = create_stuff_documents_chain(llm=llm, prompt=rag_prompt)

# Full pipeline: the vector-store retriever feeds the document chain above.
retrieval_chain = create_retrieval_chain(
    vector_store.as_retriever(),
    retrieval_qa_chain,
)

Ingestion
Ingestion Finish
Retrieval paradigm


In [36]:
# Run one question through the retrieval chain and keep the full result mapping.
sample_Result = retrieval_chain.invoke(
    dict(input="How can i use a vector DB")
)

In [38]:
# Show only the generated answer text from the chain result.
print(sample_Result["answer"])

To use a Vector Database (VectorDB), you can follow these steps:

1. **Install the Required Libraries**: You need a library like faiss (Facebook AI Similarity Search) or Annoy (Approximate Nearest Neighbors Oh Yeah). For this example, you can install faiss using the following command:
   ```
   pip install faiss-cpu
   ```

2. **Import the Library**: In your Python script, import the necessary libraries:
   ```python
   import faiss
   import numpy as np
   ```

3. **Create Some Vector Data**: Generate sample vectors that you want to store in the VectorDB. For example:
   ```python
   d = 128  # Dimension of vectors
   nb = 1000  # Number of vectors
   np.random.seed(1234)
   vectors = np.random.random((nb, d)).astype('float32')
   ```

4. **Build the Index**: Create an index to store the vectors and make similarity searches efficient:
   ```python
   index = faiss.IndexFlatL2(d)  # L2 distance
   index.add(vectors)  # Add vectors to the index
   print(f"Number of vectors in the index:

In [26]:
# Inspect the first message template of the prompt pulled from the hub.
rag_prompt.messages[0]

SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='Answer any use questions based solely on the context below:\n\n<context>\n{context}\n</context>'), additional_kwargs={})

In [21]:
# Display the whole prompt object pulled from the hub.
rag_prompt

[ChatPromptTemplate(input_variables=['context', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages.chat.ChatMessageChunk, Tag(tag='ChatMessageChunk')], typing.Annotated[langchain_core.messages.system.SystemMessageChunk, Tag(tag='SystemMe

In [12]:
# Query the vector store directly (no LLM involved) and display the matching documents.
vector_store.similarity_search("best options to Pinecone")

[Document(id='28a6479e-c706-401a-8fea-847448e89bc3', metadata={'source': 'data/mediumArticle.txt'}, page_content='Example Technologies and Tools\n\n    FAISS (Facebook AI Similarity Search):\n\n    A library for efficient similarity search and clustering of dense vectors.\n    Supports a range of indexing algorithms and is optimized for both CPU and GPU.\n\n    Annoy (Approximate Nearest Neighbors Oh Yeah):\n\n    A library for approximate nearest neighbor search, particularly useful for large, read-only datasets.\n    Uses a combination of random projections and trees for indexing.\n\n    HNSW (Hierarchical Navigable Small World):\n\n    An algorithm for efficient approximate nearest neighbor search.\n    Utilizes a graph-based structure for fast and scalable search.\n\n    Milvus:\n\n    An open-source vector database designed for scalable similarity search.\n    Supports hybrid search combining vector and traditional databases.\n\nSteps to Use VectorDB\n1. Install the Required Libra