<a href="https://colab.research.google.com/github/lcoia/LearningLangChain/blob/main/Chapter2/Chapter2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install langchain langchain-google-genai langchain-community

In [None]:
"""
a-text-loader.py

Setup: download ECB_policymakers.txt from the link below and upload it to the Colab sample_data folder before running this cell.
Note: Uploaded files are not saved after the Colab runtime terminates, so repeat the upload in every new session.
This example then loads ECB_policymakers.txt from the sample_data folder.

https://drive.google.com/file/d/1pO-DRfmc5KuHIZbD75hffcyAEmICclbL/view?usp=sharing


LangChain Loaders
https://python.langchain.com/api_reference/community/document_loaders.html


Vector Similarity
https://www.pinecone.io/learn/vector-similarity/
"""

from langchain_community.document_loaders import TextLoader

# Point the loader at the uploaded sample file and decode it as UTF-8.
loader = TextLoader(
    file_path="./sample_data/ECB_policymakers.txt",
    encoding="utf-8",
)
docs = loader.load()

# Show everything the loader returned.
print(docs)

In [None]:
"""
Beautiful Soup is a library that makes it easy to scrape information from web pages.
This library is required for the next example.

"""

%pip install beautifulsoup4

In [None]:
"""
b-web-loader.py

Load a web page.

"""

from langchain_community.document_loaders import WebBaseLoader

# Fetch the LangChain home page and load its contents.
loader = WebBaseLoader(
    web_path="https://www.langchain.com/",
)
docs = loader.load()

# Show everything the loader returned.
print(docs)

In [None]:
"""
pypdf is a PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files.
This library is required for the next example.

"""

%pip install pypdf

In [None]:
"""
c-pdf-loader.py

Note: Files are not saved after the Colab runtime terminates.
Please upload the PDF file from the link below to the sample_data folder.

https://www.babson.edu/media/babson/assets/cutler-center/Introduciton-to-Technical-Analysis.pdf

"""
import pprint
from langchain_community.document_loaders import PyPDFLoader

# The file name (including its spelling) must match the uploaded PDF exactly.
loader = PyPDFLoader(
    file_path="./sample_data/Introduciton-to-Technical-Analysis.pdf",
)
pages = loader.load()

# Pretty-print the loaded result so it is easier to read than a plain print.
pprint.pprint(pages)

In [None]:
"""
d-rec-text-splitter.py

Split the document into chunks to fit in the context window of the LLM.
Keep semantically related chunks together.

LangChain Text Splitters
https://python.langchain.com/docs/concepts/text_splitters/


Late chunking for better semantic context.
https://www.datacamp.com/tutorial/late-chunking
https://docs.chonkie.ai/chunkers/overview

MTEB Embedding Models (Massive Text Embedding Benchmark)
https://modal.com/blog/mteb-leaderboard-article

"""

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
import pprint

# Step 1: load the uploaded sample file, decoded as UTF-8.
loader = TextLoader(
    file_path="./sample_data/ECB_policymakers.txt",
    encoding="utf-8",
)
docs = loader.load()

# Step 2: split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200).
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splitted_docs = splitter.split_documents(documents=docs)

# Step 3: pretty-print the resulting chunks.
pprint.pprint(splitted_docs)


In [None]:
"""
e-rec-text-splitter-code.py

Split source code and Markdown into semantically meaningful chunks.

"""

from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

# The sample must be real multi-line source. Collapsed onto a single line it
# contains no newlines (and everything after '#' becomes a comment), so the
# language-aware splitter has no code structure to split on.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

# Build a splitter configured for Python source. The small chunk_size forces
# this short sample to be split into more than one chunk.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)

# create_documents takes a list of raw strings (not Document objects).
python_docs = python_splitter.create_documents([PYTHON_CODE])

print(python_docs)

In [None]:
"""
g-embeddings.py

Generating text embeddings.

"""

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from google.colab import userdata

# Note: Google model names must be prefixed with 'models/'
# The API key is read from the Colab secret named GOOGLE_API_KEY.
model = GoogleGenerativeAIEmbeddings(
    google_api_key=userdata.get("GOOGLE_API_KEY"),
    model="models/text-embedding-004",
)

# Embed a handful of short sample sentences in a single call.
embeddings = model.embed_documents(
    texts=[
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!",
    ]
)

print(embeddings)


In [None]:
"""
h-load-split-embed.py

Load, split, and generate embeddings.
"""

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from google.colab import userdata

# Step 1 - load: read the uploaded sample file as UTF-8 text.
loader = TextLoader(
    file_path="./sample_data/ECB_policymakers.txt",
    encoding="utf-8",
)
doc = loader.load()

# Step 2 - split: break the loaded document into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = splitter.split_documents(documents=doc)

# Step 3 - embed: send only the text of each chunk to the embedding model.
# The API key is read from the Colab secret named GOOGLE_API_KEY.
embeddings_model = GoogleGenerativeAIEmbeddings(
    google_api_key=userdata.get("GOOGLE_API_KEY"),
    model="models/text-embedding-004",
)
embeddings = embeddings_model.embed_documents(
    texts=[piece.page_content for piece in chunks]
)

print(embeddings)

In [None]:
"""
Substituting Qdrant for Postgres as a vector store.


"""
!pip install langchain-qdrant


In [None]:
"""
j-record-manager.py


"""

from langchain.indexes import SQLRecordManager, index
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from google.colab import userdata

# Name shared by the Qdrant collection and the record-manager namespace.
COLLECTION_NAME = "demo_collection"

# The API key is read from the Colab secret named GOOGLE_API_KEY.
embeddings_model = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=userdata.get('GOOGLE_API_KEY'))

# Measure the embedding dimension from the model instead of hard-coding it:
# the collection's vector size has to match what the model actually returns,
# otherwise the vectors cannot be stored in the collection.
vector_size = len(embeddings_model.embed_query("dimension probe"))

# In-memory Qdrant instance: nothing is persisted once the runtime stops.
client = QdrantClient(":memory:")
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)

vectorstore = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings_model,
)

# The namespace separates this collection's records from any other records
# kept by the same record manager.
namespace = f"qdrant/{COLLECTION_NAME}"

# The record manager tracks which documents have already been indexed.
# An in-memory SQLite database is used so the cell runs without an external
# database server and, like the in-memory Qdrant collection, starts empty on
# every fresh run. To keep the records in Postgres instead, use e.g.
# db_url="postgresql+psycopg://langchain:langchain@localhost:6024/langchain".
record_manager = SQLRecordManager(
    namespace,
    db_url="sqlite:///:memory:",
)

# Create the schema if it doesn't exist
record_manager.create_schema()

# Create documents
docs = [
    Document(page_content='there are cats in the pond', metadata={
             "id": 1, "source": "cats.txt"}),
    Document(page_content='ducks are also found in the pond', metadata={
             "id": 2, "source": "ducks.txt"}),
]

# Index the documents
index_1 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",  # prevent duplicate documents
    source_id_key="source",  # use the source field as the source_id
)

print("Index attempt 1:", index_1)

# second time you attempt to index, it will not add the documents again
index_2 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

print("Index attempt 2:", index_2)

# If we mutate a document, the new version will be written and all old versions sharing the same source will be deleted.
# (The in-place mutation below is deliberate: it is what this step demonstrates.)

docs[0].page_content = "I just modified this document!"

index_3 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

print("Index attempt 3:", index_3)


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable