## Converting Your Documents into Text

In [1]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./test.txt")
loader.load()

[Document(metadata={'source': './test.txt'}, page_content='Matthew 5:3-12 (The Beatitudes)\n\n3 "Blessed are the poor in spirit, for theirs is the kingdom of heaven. 4 Blessed are those who mourn, for they will be comforted. 5 Blessed are the meek, for they will inherit the earth. 6 Blessed are those who hunger and thirst for righteousness, for they will be filled. 7 Blessed are the merciful, for they will be shown mercy. 8 Blessed are the pure in heart, for they will see God. 9 Blessed are the peacemakers, for they will be called children of God. 10 Blessed are those who are persecuted because of righteousness, for theirs is the kingdom of heaven. 11 Blessed are you when people insult you, persecute you and falsely say all kinds of evil against you because of me. 12 Rejoice and be glad, because great is your reward in heaven, for in the same way they persecuted the prophets who were before you.\n\nMatthew 5:13-16 (Salt and Light)\n\n13 "You are the salt of the earth. But if the salt l

In [2]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.langchain.com/")
loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://www.langchain.com/', 'title': 'LangChain', 'description': 'LangChain’s suite of products supports developers along each step of their development journey.', 'language': 'en'}, page_content="LangChain\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nLangGraphLangSmithLangChainResources\n\nResources HubBlogCustomer StoriesLangChain AcademyCommunityExpertsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricing\n\nLangSmithLangGraph PlatformGet a demoSign up\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nLangGraphLangSmithLangChainResources\n\nResources HubBlogCustomer StoriesLangChain AcademyCommunityExpertsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricing\n\nLangSmithLangGraph PlatformGet a demoSign upLangChain’s suite of products supports developers along each step of the LLM application lifecycle.Applications that can r

## Chunks

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = TextLoader("./test.txt")
docs = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splitter.split_documents(docs)

[Document(metadata={'source': './test.txt'}, page_content='Matthew 5:3-12 (The Beatitudes)\n\n3 "Blessed are the poor in spirit, for theirs is the kingdom of heaven. 4 Blessed are those who mourn, for they will be comforted. 5 Blessed are the meek, for they will inherit the earth. 6 Blessed are those who hunger and thirst for righteousness, for they will be filled. 7 Blessed are the merciful, for they will be shown mercy. 8 Blessed are the pure in heart, for they will see God. 9 Blessed are the peacemakers, for they will be called children of God. 10 Blessed are those who are persecuted because of righteousness, for theirs is the kingdom of heaven. 11 Blessed are you when people insult you, persecute you and falsely say all kinds of evil against you because of me. 12 Rejoice and be glad, because great is your reward in heaven, for in the same way they persecuted the prophets who were before you.\n\nMatthew 5:13-16 (Salt and Light)'),
 Document(metadata={'source': './test.txt'}, page_co

In [5]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=50, chunk_overlap=0
)
python_splitter.create_documents([PYTHON_CODE])

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'),
 Document(metadata={}, page_content='# Call the function\nhello_world()')]

In [6]:
markdown_text = """
# LangChain

⚡ Building applications with LLMs through composability ⚡

## Quick Install

```bash
pip install langchain
```

As an open source project in a rapidly developing field, we are extremely open 
    to contributions.
"""

md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0
)
md_splitter.create_documents([markdown_text], 
    [{"source": "https://www.langchain.com"}])


[Document(metadata={'source': 'https://www.langchain.com'}, page_content='# LangChain'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='⚡ Building applications with LLMs through composability ⚡'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='## Quick Install\n\n```bash\npip install langchain'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='```'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='As an open source project in a rapidly developing field, we'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='are extremely open'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='to contributions.')]

In [8]:
import json
import openai
# Read the configuration file
with open('config.json') as f:
    config = json.load(f)
# Get the API key from the configuration
api_key = config['OPENAI_API_KEY']
# Set up the OpenAI API client
openai.api_key = api_key

In [9]:
from langchain_openai import OpenAIEmbeddings

model = OpenAIEmbeddings(api_key=api_key)

embeddings = model.embed_documents([
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!"
])

In [10]:
embeddings

[[-0.020325319841504097,
  -0.007096723187714815,
  -0.022839006036520004,
  -0.026279456913471222,
  -0.037527572363615036,
  0.02163294516503811,
  -0.006144568789750338,
  -0.008975640870630741,
  0.008524954319000244,
  -0.016618264839053154,
  0.02683805488049984,
  -0.007356978487223387,
  -0.013545980677008629,
  -0.024133935570716858,
  0.006512735038995743,
  -0.020198365673422813,
  0.02426088973879814,
  -0.014739347621798515,
  0.016427835449576378,
  -0.01647861674427986,
  -0.007204633671790361,
  -0.008080615662038326,
  0.004694120492786169,
  -0.002066174754872918,
  -0.014802824705839157,
  -0.005989050026983023,
  -0.0020868047140538692,
  -0.02301674149930477,
  0.019855590537190437,
  -0.031535349786281586,
  0.012860430404543877,
  0.011622629128396511,
  -0.008518606424331665,
  -0.009477108716964722,
  -0.001813853858038783,
  -0.027422042563557625,
  -0.008264699019491673,
  0.002078870078548789,
  0.024006983265280724,
  -0.008734428323805332,
  0.023499166592

In [11]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

## Load the document 

loader = TextLoader("./test.txt")
doc = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
chunks = text_splitter.split_documents(doc)

embeddings_model = OpenAIEmbeddings(api_key=api_key)
embeddings = embeddings_model.embed_documents(
    [chunk.page_content for chunk in chunks]
)

In [17]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres.vectorstores import PGVector
from langchain_core.documents import Document
import uuid

raw_documents = TextLoader('./test.txt').load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, 
    chunk_overlap=200)
documents = text_splitter.split_documents(raw_documents)

# embed each chunk and insert it into the vector store
embeddings_model = OpenAIEmbeddings(api_key=api_key)
connection = 'postgresql+psycopg://langchain:langchain@localhost:6024/langchain'
db = PGVector.from_documents(documents, embeddings_model, connection=connection)

In [18]:
ids = [str(uuid.uuid4()), str(uuid.uuid4())]
db.add_documents(
    [
        Document(
            page_content="there are cats in the pond",
            metadata={"location": "pond", "topic": "animals"},
        ),
        Document(
            page_content="ducks are also found in the pond",
            metadata={"location": "pond", "topic": "animals"},
        ),
    ],
    ids=ids,
)

['eb24ce52-1e6d-48ca-85a1-3675c5597e2a',
 '42e0f3bc-9d51-461d-9cf9-deee9ea8ee7f']

In [23]:
db.similarity_search("jesus", k=2)


[Document(id='569d5da9-ce96-41c6-9ddc-93dde697d512', metadata={'source': './test.txt'}, page_content='Matthew 5:13-16 (Salt and Light)\n\n13 "You are the salt of the earth. But if the salt loses its saltiness, how can it be made salty again? It is no longer good for anything, except to be thrown out and trampled by men. 14 You are the light of the world. A town built on a hill cannot be hidden. 15 Neither do people light a lamp and put it under a bowl. Instead, they put it on its stand, and it gives light to everyone in the house. 16 In the same way, let your light shine before others, that they may see your good deeds and glorify your Father in heaven."'),
 Document(id='c3a7827b-2b14-466a-bd9e-eb53ccab6b9d', metadata={'source': './test.txt'}, page_content='Matthew 5:13-16 (Salt and Light)\n\n13 "You are the salt of the earth. But if the salt loses its saltiness, how can it be made salty again? It is no longer good for anything, except to be thrown out and trampled by men. 14 You are t

In [24]:
from langchain.indexes import SQLRecordManager, index
from langchain_postgres.vectorstores import PGVector
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document

connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "my_docs"
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=api_key)
namespace = "my_docs_namespace"

vectorstore = PGVector(
    embeddings=embeddings_model,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

record_manager = SQLRecordManager(
    namespace,
    db_url="postgresql+psycopg://langchain:langchain@localhost:6024/langchain",
)

record_manager.create_schema()

In [26]:
docs = [
    Document(page_content='there are cats in the pond', metadata={
        "id": 1, "source": "cats.txt"}),
    Document(page_content='ducks are also found in the pond', metadata={
        "id": 2, "source": "ducks.txt"}),
]

index_1 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",  # prevent duplicate documents
    source_id_key="source",  # use the source field as the source_id
)

print("Index attempt 1:", index_1)


Index attempt 1: {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}


In [27]:
index_2 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

print("Index attempt 2:", index_2)


Index attempt 2: {'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}


In [28]:
docs[0].page_content = "I just modified this document!"
	
index_3 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)
	
print("Index attempt 3:", index_3)

Index attempt 3: {'num_added': 1, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 1}


## Strategies