# Code documentation Q&A bot example with LangChain

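This Q&A bot lets you query your own documentation with natural-language questions. It uses LangChain to wire retrieval and answering together, LanceDB as the vector store, and OpenAI for the embeddings and the language model. The pandas 2.0 documentation serves as the example corpus.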

In [40]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# `unstructured` is needed by UnstructuredHTMLLoader, which parses the HTML pages below.
%pip install --quiet openai langchain unstructured
%pip install --quiet -U lancedb

In [41]:
import os
import lancedb
import re
import pickle
from pathlib import Path

In [42]:
import openai

# Never store a real API key in the notebook: the file is shared with every
# reader and kept in version-control history.
# NOTE(review): a literal key used to be committed on this line; treat it as
# leaked and revoke it.
# Reuse a key already exported in the environment; otherwise prompt for it
# without echoing, so the secret is never written into the notebook file.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass

    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [43]:
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import LanceDB
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

In [44]:
def get_document_title(document):
    """Derive a page title from the document's source path.

    The title is the part of ``document.metadata["source"]`` found between the
    ``pandas.documentation`` directory name and the ``.html`` extension, e.g.
    ``pandas.documentation/merging.html`` -> ``/merging``.

    Args:
        document: Object exposing a ``metadata`` mapping with a ``"source"``
            entry. The value is converted with ``str`` before matching.

    Returns:
        The extracted title, or ``''`` when the source does not contain the
        expected ``pandas.documentation...html`` pattern.
    """
    source = str(document.metadata["source"])
    # The dots are escaped so they only match a literal "." (unescaped, they
    # matched any character). re.search returns None on a miss, which avoids
    # indexing into an empty findall() result.
    match = re.search(r"pandas\.documentation(.*)\.html", source)
    return match.group(1) if match else ''

In [45]:
# Parsed documents are cached in docs.pkl so the HTML is only processed once.
docs_path = Path("docs.pkl")
docs = []

if not docs_path.exists():
    # No cache yet: load every HTML page found under ./pandas.documentation.
    for p in Path("./pandas.documentation").rglob("*.html"):
        if p.is_dir():
            continue
        raw_document = UnstructuredHTMLLoader(p).load()

        # Tag every document the loader returned, not only the first one, so
        # an empty or multi-element result is handled as well.
        for doc in raw_document:
            doc.metadata = doc.metadata | {
                "title": get_document_title(doc),
                "version": "2.0rc0",
                # Keep the source as a plain string in the stored metadata.
                "source": str(doc.metadata["source"]),
            }
        # extend() appends in place; `docs = docs + raw_document` rebuilt the
        # whole list for every page.
        docs.extend(raw_document)

    with docs_path.open("wb") as fh:
        pickle.dump(docs, fh)
else:
    # NOTE: unpickling can execute arbitrary code embedded in the file. Only
    # reuse a docs.pkl that was written by this notebook.
    with docs_path.open("rb") as fh:
        docs = pickle.load(fh)

In [46]:
# Spot-check the first loaded document and the metadata attached to it.
docs[0]

Document(page_content='The page has been moved to Merge, join, concatenate and compare', metadata={'source': 'pandas.documentation/merging.html', 'title': '/merging', 'version': '2.0rc0'})

In [47]:
# Cut the loaded pages into chunks (chunk_size=1000, chunk_overlap=200) so
# that each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

# Embedding model shared by the indexing step and the later queries.
embeddings = OpenAIEmbeddings()

In [48]:
# Connect to the LanceDB database stored under /tmp/lancedb.
db = lancedb.connect('/tmp/lancedb')

# The table is created from a single placeholder row carrying the
# "vector", "text" and "id" fields.
# NOTE(review): the placeholder row presumably stays in the table and could
# be returned by searches - confirm this is intended.
table = db.create_table(
    "pandas_docs",
    data=[
        {
            "vector": embeddings.embed_query("Hello World"),
            "text": "Hello World",
            "id": "1",
        },
    ],
    mode="overwrite",
)

# Build the LangChain vector store over that table from the document chunks.
docsearch = LanceDB.from_documents(documents, embeddings, connection=table)

In [38]:
# Query the vector store directly (no LLM involved) and show the first hit.
query = "What's the current version of pandas?"
result = docsearch.similarity_search(query)
result[0]

Document(page_content='The page has been moved to Extending pandas', metadata={'vector': array([ 0.00032758,  0.00718955,  0.00039428, ...,  0.01052455,
       -0.01477229, -0.035358  ], dtype=float32), 'id': '2d82fc26-ac12-4e4e-93eb-4182f1de73a0', 'score': 0.2979205846786499})

In [None]:
# Raw vector-store lookup for a second question; only the first hit is shown.
query = "What are the major differences in pandas 2.0?"
result = docsearch.similarity_search(query)
result[0]

In [49]:
# Question-answering chain built from the OpenAI LLM and a retriever over the
# LanceDB-backed vector store.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [50]:
# Put the question to the QA chain; the returned answer is the cell output.
query = "What are the major differences in pandas 2.0?"
qa.run(query)

' The major differences in pandas 2.0 include installing optional dependencies with pip extras, the ability to use any numpy numeric dtype in an Index, and enhancements, notable bug fixes, backwards incompatible API changes, deprecations, and performance improvements.'

In [51]:
# Same question as the raw similarity search earlier, now answered by the chain.
query = "What's the current version of pandas?"
qa.run(query)

' 2.0.0rc0'

In [52]:
# Ask the QA chain a how-to question about installing optional dependencies.
query = "How do I make use of installing optional dependencies?"
qa.run(query)

' Optional dependencies can be installed with pip install "pandas[all]" or "pandas[performance]". This will install all recommended performance dependencies such as numexpr, bottleneck and numba.'

In [53]:
# Ask the QA chain about backwards incompatible API changes.
query = "What are the backwards incompatible API changes in Pandas 2.0?"
qa.run(query)

" \n\nPandas 2.0 includes a number of API breaking changes, such as increased minimum versions for dependencies, the use of os.linesep for DataFrame.to_csv's line_terminator, and reorganization of the library. See the release notes for a full list of changes."