# Quickstart

In [2]:
%pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.12.1-py3-none-any.whl.metadata (5.3 kB)
Collecting elastic-transport<9,>=8 (from elasticsearch)
  Downloading elastic_transport-8.12.0-py3-none-any.whl.metadata (3.5 kB)
Downloading elasticsearch-8.12.1-py3-none-any.whl (432 kB)
   ---------------------------------------- 0.0/432.1 kB ? eta -:--:--
   --- ----------------------------------- 41.0/432.1 kB 991.0 kB/s eta 0:00:01
   -------- ------------------------------- 92.2/432.1 kB 1.1 MB/s eta 0:00:01
   ---------- ----------------------------- 112.6/432.1 kB 1.1 MB/s eta 0:00:01
   ---------- ----------------------------- 112.6/432.1 kB 1.1 MB/s eta 0:00:01
   --------------------- ------------------ 235.5/432.1 kB 1.1 MB/s eta 0:00:01
   ---------------------------- ----------- 307.2/432.1 kB 1.1 MB/s eta 0:00:01
   --------------------------------- ------ 358.4/432.1 kB 1.1 MB/s eta 0:00:01
   -------------------------------------- - 419.8/432.1 kB 1.1 MB/s eta 0:00:01
   ---

In [27]:
from langchain.indexes import SQLRecordManager, index
from langchain.schema import Document
from langchain_community.vectorstores import ElasticsearchStore, Chroma, FAISS
from langchain_openai import OpenAIEmbeddings


# Single source of truth for the collection name: it names the Chroma collection
# here and is reused to build the record-manager namespace.
collection_name = "test_index"

# Embedding model handed to the vector store.
embedding = OpenAIEmbeddings()

# Use `collection_name` (not a separate hard-coded name) so the collection and
# the record-manager namespace derived from it always refer to the same store.
vectorstore = Chroma(collection_name=collection_name, embedding_function=embedding)


In [28]:
# Namespace under which the record manager tracks documents for this store.
namespace = "chroma/" + collection_name

# Indexing bookkeeping is kept in a local SQLite database file.
record_manager = SQLRecordManager(namespace, db_url="sqlite:///record_manager_cache.sql")

In [30]:
# Create the record manager's schema before it is first used by `index`.
record_manager.create_schema()

In [31]:
# Two minimal sample documents, each attributed to its own source file.
doc1 = Document(page_content="kitty", metadata={"source": "kitty.txt"})
doc2 = Document(page_content="doggy", metadata={"source": "doggy.txt"})

In [32]:
# Left disabled: adding documents directly would bypass `index` and the record manager.
#vectorstore.add_documents([doc1])

# `None` deletion mode

In [33]:
def _clear():
    """Clear previously indexed content (hacky helper function).

    Works by indexing an empty batch with ``cleanup="full"``; see the `full`
    deletion mode section to understand why that removes everything.
    """
    nothing_to_index = []
    index(
        nothing_to_index,
        record_manager,
        vectorstore,
        cleanup="full",
        source_id_key="source",
    )

In [34]:
# Start this section from an empty index.
_clear()

In [36]:
# The same document is passed five times; the result reports a single addition,
# i.e. duplicates within one call are collapsed.
index(
    [doc1, doc1, doc1, doc1, doc1],
    record_manager,
    vectorstore,
    cleanup=None,
    source_id_key="source",
)

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [37]:
# Reset the index before the next experiment.
_clear()

In [38]:
# First indexing of two distinct documents after a clear: both are added.
index([doc1, doc2], record_manager, vectorstore, cleanup=None, source_id_key="source")

{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [39]:
# Re-indexing the same, unchanged documents: both are skipped, nothing is rewritten.
index([doc1, doc2], record_manager, vectorstore, cleanup=None, source_id_key="source")

{'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}

# `incremental` deletion mode

In [40]:
# Start this section from an empty index.
_clear()

In [41]:
# Initial load in incremental mode: both documents are added.
index(
    [doc1, doc2],
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [42]:
# Same input again: both documents are skipped because nothing changed.
index(
    [doc1, doc2],
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}

In [43]:
# An empty batch in incremental mode changes nothing: no additions and no deletions.
index([], record_manager, vectorstore, cleanup="incremental", source_id_key="source")

{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [44]:
# New content for a source that is already indexed (`doggy.txt`).
changed_doc_2 = Document(page_content="puppy", metadata={"source": "doggy.txt"})

In [45]:
# Indexing the changed document adds the new version and deletes one stale entry
# (1 added, 1 deleted), replacing the previous version from the same source.
index(
    [changed_doc_2],
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 1}

# `full` deletion mode

In [46]:
# Start this section from an empty index.
_clear()

In [53]:
# The complete set of documents passed to `index` in full mode.
all_docs = [doc1, doc2]

In [48]:
# Full mode, initial load: every document in the batch is added.
index(all_docs, record_manager, vectorstore, cleanup="full", source_id_key="source")

{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [54]:
# Drop doc1 by rebuilding the list instead of calling `all_docs.remove(doc1)`:
# `remove` raises ValueError once doc1 is already gone, so the cell could not be
# re-run. Filtering gives the same result and is safe to execute repeatedly.
all_docs = [doc for doc in all_docs if doc != doc1]
all_docs

[Document(page_content='doggy', metadata={'source': 'doggy.txt'})]

In [55]:
# doc1 is no longer in the batch, so full mode deletes it; the unchanged doc2 is skipped.
index(all_docs, record_manager, vectorstore, cleanup="full", source_id_key="source")

{'num_added': 0, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 1}

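# Source

Every document carries a `source` entry in its metadata, and `source_id_key="source"` tells `index` which metadata key to use. Chunks split from the same file share one source, so re-indexing a changed file in `incremental` mode replaces that file's old chunks and leaves chunks from other files alone.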

In [56]:
from langchain.text_splitter import CharacterTextSplitter

# Longer versions of the two sample documents, so that splitting yields several chunks.
# NOTE: this rebinds `doc1` and `doc2` defined in an earlier cell.
doc1 = Document(
    page_content="kitty kitty kitty kitty kitty",
    metadata={"source": "kitty.txt"},
)
doc2 = Document(
    page_content="doggy doggy the doggy",
    metadata={"source": "doggy.txt"},
)

In [57]:
# Split both documents into small chunks on the character "t". As the output
# shows, every chunk keeps the `source` metadata of the document it came from.
new_docs = CharacterTextSplitter(
    separator="t", keep_separator=True, chunk_size=12, chunk_overlap=2
).split_documents([doc1, doc2])

# Display the resulting chunks.
new_docs

[Document(page_content='kitty kit', metadata={'source': 'kitty.txt'}),
 Document(page_content='tty kitty ki', metadata={'source': 'kitty.txt'}),
 Document(page_content='tty kitty', metadata={'source': 'kitty.txt'}),
 Document(page_content='doggy doggy', metadata={'source': 'doggy.txt'}),
 Document(page_content='the doggy', metadata={'source': 'doggy.txt'})]

In [58]:
# Start this section from an empty index.
_clear()

In [59]:
# Index the chunks from both sources: all five are added.
index(
    new_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 5, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [60]:
# Replacement content for `doggy.txt`: two new chunks that share that source.
changed_doggy_docs = [
    Document(page_content="woof woof", metadata={"source": "doggy.txt"}),
    Document(page_content="woof woof woof", metadata={"source": "doggy.txt"}),
]

In [61]:
# Incremental cleanup: the two new chunks are added and two stale chunks are
# deleted (2 added, 2 deleted); the search in the next cell still finds a
# `kitty.txt` chunk, so that source is left in place.
index(
    changed_doggy_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 2}

In [63]:
# Inspect what the store now returns for a dog-related query (top 3 hits).
vectorstore.similarity_search("dog", k=3)

[Document(page_content='woof woof', metadata={'source': 'doggy.txt'}),
 Document(page_content='woof woof woof', metadata={'source': 'doggy.txt'}),
 Document(page_content='tty kitty', metadata={'source': 'kitty.txt'})]

# Using with loaders

In [64]:
from langchain_community.document_loaders.base import BaseLoader


class MyCustomLoader(BaseLoader):
    """Toy loader that serves two hard-coded `doggy.txt` documents, run through a splitter."""

    def lazy_load(self):
        """Yield the split sample documents one at a time."""
        sample_docs = [
            Document(page_content=text, metadata={"source": "doggy.txt"})
            for text in ("woof woof", "woof woof woof")
        ]
        splitter = CharacterTextSplitter(
            separator="t", keep_separator=True, chunk_size=12, chunk_overlap=2
        )
        for chunk in splitter.split_documents(sample_docs):
            yield chunk

    def load(self):
        """Return everything produced by `lazy_load` as a list."""
        return [chunk for chunk in self.lazy_load()]

In [65]:
# Reset the index, then create the loader that will feed `index`.
_clear()
loader = MyCustomLoader()

In [66]:
# Materialise the loader's documents to inspect what will be indexed.
loader.load()

[Document(page_content='woof woof', metadata={'source': 'doggy.txt'}),
 Document(page_content='woof woof woof', metadata={'source': 'doggy.txt'})]

In [67]:
# The loader is passed to `index` directly, in place of a list of documents.
index(loader, record_manager, vectorstore, cleanup="full", source_id_key="source")

{'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [68]:
# k exceeds the number of stored documents, so the store returns everything it
# holds (two documents) and emits the notice shown in the output.
vectorstore.similarity_search("dog", k=30)

Number of requested results 30 is greater than number of elements in index 2, updating n_results = 2


[Document(page_content='woof woof', metadata={'source': 'doggy.txt'}),
 Document(page_content='woof woof woof', metadata={'source': 'doggy.txt'})]