https://python.langchain.com/docs/tutorials/retrievers/

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
from openai import OpenAI
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage
from langgraph.prebuilt import create_react_agent
from langchain_core.messages import HumanMessage, SystemMessage
from enum import Enum
from langchain_google_genai import ChatGoogleGenerativeAI

# https://python.langchain.com/docs/tutorials/agents/

# API keys are read from the process environment (load_dotenv() is called
# above); each name is None when its variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Prefer TAVILY_API_KEY and fall back to the alternate TVLY_API_KEY spelling.
# Previously the TVLY_API_KEY lookup was unconditionally overwritten by the
# next line, so that variable was never actually honoured.
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY") or os.getenv("TVLY_API_KEY")

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")

In [None]:
# Report whether each key was found, one True/False per line, in the order
# OpenAI, Google, NVIDIA. Only presence is printed, never the key itself.
print(
    bool(OPENAI_API_KEY),
    bool(GOOGLE_API_KEY),
    bool(NVIDIA_API_KEY),
    sep="\n",
)

In [None]:
from langchain_core.documents import Document

# Two hand-written sample documents; both carry the same "source" metadata.
# The metadata dict is built inside the comprehension so each Document gets
# its own dict object, as in a literal list.
documents = [
    Document(page_content=text, metadata={"source": "mammal-pets-doc"})
    for text in (
        "Dogs are great companions, known for their loyalty and friendliness.",
        "Cats are independent pets that often enjoy their own space.",
    )
]

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Bare filename, so the PDF is resolved relative to the current working
# directory.
file_path = "nke-10k-2023.pdf"
loader = PyPDFLoader(file_path)

# NOTE(review): PyPDFLoader presumably yields one Document per PDF page, in
# which case the count printed below is the page count — confirm in the docs.
docs = loader.load()

print(len(docs))

In [None]:
# Preview the first loaded document: the first 200 characters of its text
# (followed by a blank line), then its metadata.
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in
# characters, and add_start_index presumably records each chunk's offset in
# its metadata — confirm against the text-splitter docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)

# Last expression of the cell: the notebook displays the number of chunks.
len(all_splits)

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from langchain_openai import OpenAIEmbeddings

# Alternative embedding provider, kept for reference; it is the only use of
# the GoogleGenerativeAIEmbeddings import in this cell:
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Active choice: OpenAI embeddings.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Sanity check: embed the text of the first two chunks individually.
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

# Both vectors come from the same embedding object, so their lengths are
# expected to match.
assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
# Show only the first ten components of the first vector.
print(vector_1[:10])

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

# Create the vector store, handing it the embedding model configured earlier.
vector_store = InMemoryVectorStore(embeddings)

In [None]:
# Add every chunk to the store; the value returned by add_documents is kept
# in `ids`.
ids = vector_store.add_documents(documents=all_splits)

In [None]:
# Query the store with a plain-text question.
results = vector_store.similarity_search(
    "How many distribution centers does Nike have in the US?"
)

# Show only the first returned document.
print(results[0])

In [None]:
# Awaitable variant of the search. The top-level `await` works in a notebook
# cell but is a syntax error in a plain Python script outside a coroutine.
results = await vector_store.asimilarity_search("When was Nike incorporated?")

print(results[0])

In [None]:
# Note that providers implement different scores.
# NOTE(review): this comment originally described the score as a distance
# metric that varies inversely with similarity. InMemoryVectorStore (used
# here) appears to return cosine similarity, where a higher score means a
# closer match — confirm against the langchain_core documentation.

results = vector_store.similarity_search_with_score("What was Nike's revenue in 2023?")
# Each result is unpacked as a (document, score) pair; inspect the first one.
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

In [None]:
# NOTE(review): this cell is an exact duplicate of the preceding cell and
# re-runs the same query; consider removing one of the two.
#
# Note that providers implement different scores.
# NOTE(review): this comment originally described the score as a distance
# metric that varies inversely with similarity. InMemoryVectorStore (used
# here) appears to return cosine similarity, where a higher score means a
# closer match — confirm against the langchain_core documentation.

results = vector_store.similarity_search_with_score("What was Nike's revenue in 2023?")
# Each result is unpacked as a (document, score) pair; inspect the first one.
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

In [None]:
# Embed the question explicitly, then search with the resulting vector
# instead of passing the text to the store.
embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")

results = vector_store.similarity_search_by_vector(embedding)
print(results[0])

Retrievers

In [None]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    """Return the result of a ``k=1`` similarity search for *query*.

    The search runs against the module-level ``vector_store``. The function
    is wrapped with ``@chain``, and the wrapped object is used below through
    its ``.batch`` method rather than being called directly.
    """
    return vector_store.similarity_search(query, k=1)


# Run both questions through the @chain-wrapped retriever in one batch call.
# As the cell's last expression, the result is displayed by the notebook.
retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)

In [None]:
# Build a retriever directly from the vector store. This rebinds the name
# `retriever`, replacing the @chain-decorated function defined in the
# previous cell. The search settings mirror that function: similarity search
# with k=1.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)

# Same two questions as before, now through the store-provided retriever.
retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)