In [1]:
from dotenv import load_dotenv
load_dotenv()

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import create_retrieval_chain


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:

# Retrieve Data
def get_docs(
    url: str = 'https://en.wikipedia.org/wiki/Rurouni_Kenshin',
    chunk_size: int = 200,
    chunk_overlap: int = 20,
):
    """Load a web page and split it into small, overlapping chunks.

    Args:
        url: Address of the page handed to ``WebBaseLoader``. Defaults to the
            Rurouni Kenshin Wikipedia article.
        chunk_size: Upper bound on the size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Amount of text shared between neighbouring chunks,
            forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    loader = WebBaseLoader(url)
    page_docs = loader.load()

    # Split the page into smaller blocks. A little overlap keeps the text
    # around each cut present in both neighbouring chunks, so a sentence that
    # straddles a boundary is not lost to retrieval.
    # NOTE: keep chunk_size within what the embedding model accepts; that
    # limit differs from model to model.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return text_splitter.split_documents(page_docs)

# A vector store is a database that indexes each document chunk by its embedding vector, so chunks can be looked up by similarity to a query.
def create_vector_store(docs):
    """Embed ``docs`` with ``OpenAIEmbeddings`` and index them in a FAISS store.

    Args:
        docs: The documents to index.

    Returns:
        The vector store built by ``FAISS.from_documents``.
    """
    return FAISS.from_documents(docs, embedding=OpenAIEmbeddings())


def create_chain(
    vectorStore,
    *,
    k: int = 3,
    model_name: str = 'gpt-3.5-turbo-1106',
    temperature: float = 0.4,
):
    """Build a retrieval chain that answers questions from ``vectorStore``.

    Args:
        vectorStore: Vector store exposing ``as_retriever``; its contents are
            the knowledge base the answers are drawn from.
        k: Number of chunks the retriever is asked to return per question.
        model_name: Chat model identifier passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        The chain built by ``create_retrieval_chain``. Invoke it with a dict
        holding the question under the ``"input"`` key.
    """
    model = ChatOpenAI(
        temperature=temperature,
        model=model_name
    )

    # The template needs both placeholders: {context} receives the retrieved
    # documents and {input} receives the user's question.
    prompt = ChatPromptTemplate.from_template("""
    Answer the user's question.
    Context: {context}
    Question: {input}
    """)

    # Feeds the supplied documents into the prompt's {context} slot and sends
    # the filled prompt to the model.
    document_chain = create_stuff_documents_chain(
        llm=model,
        prompt=prompt
    )

    # The retriever looks up the k chunks in the vector store that best match
    # the question.
    retriever = vectorStore.as_retriever(search_kwargs={"k": k})

    # Glue retrieval and answering together: retrieved chunks flow into the
    # document chain as its context.
    return create_retrieval_chain(retriever, document_chain)

In [3]:
# Fetch and chunk the article, then peek at the first three chunks as a sanity check.
docs = get_docs()
print(docs[:3])

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Rurouni_Kenshin', 'title': 'Rurouni Kenshin - Wikipedia', 'language': 'en'}, page_content='Rurouni Kenshin - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation'), Document(metadata={'source': 'https://en.wikipedia.org/wiki/Rurouni_Kenshin', 'title': 'Rurouni Kenshin - Wikipedia', 'language': 'en'}, page_content='Navigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch'), Document(metadata={'source': 'https://en.wikipedia.org/wiki/Rurouni_Kenshin', 'title': 'Rurouni Kenshin - Wikipedia', 'language': 'en'}, page_content='Search\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\n\n\n\n\n\n\n\nAppearance\

In [4]:
# Index the chunks, wire the index into the retrieval chain, and ask one question.
vectorStore = create_vector_store(docs)
chain = create_chain(vectorStore)

# The chain reads the question from the "input" key.
response = chain.invoke({"input": "Which magazine was Rurouni Kenshin been published ?"})

print(response)

{'input': 'Which magazine was Rurouni Kenshin been published ?', 'context': [Document(metadata={'source': 'https://en.wikipedia.org/wiki/Rurouni_Kenshin', 'title': 'Rurouni Kenshin - Wikipedia', 'language': 'en'}, page_content="Written and illustrated by Nobuhiro Watsuki, Rurouni Kenshin was serialized in Shueisha's shōnen manga magazine Weekly Shōnen Jump from April 12, 1994,[b] to September 21, 1999.[c] The 255 individual"), Document(metadata={'source': 'https://en.wikipedia.org/wiki/Rurouni_Kenshin', 'title': 'Rurouni Kenshin - Wikipedia', 'language': 'en'}, page_content='Two encyclopedias of the Rurouni Kenshin manga were released; the first one, Rurouni Kenshin Profiles (原典), was released first in Japan on July 4, 1996, by Shueisha and in the United States by Viz'), Document(metadata={'source': 'https://en.wikipedia.org/wiki/Rurouni_Kenshin', 'title': 'Rurouni Kenshin - Wikipedia', 'language': 'en'}, page_content='Further reading[edit]\n"Rurouni Kenshin, vol. 1". Voice of Youth Ad