# Settings

In [None]:
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load it; the return value is discarded via `_`.
# NOTE(review): presumably this supplies connection settings used further down
# (e.g. for the Redis vector store) - confirm which variables are required.
env_file = find_dotenv()
_ = load_dotenv(env_file)

# Data Loading

In [1]:
import pathlib

# Directory holding the sample documents, relative to the notebook's working directory.
# NOTE(review): the recorded outputs below show sources under '../data_sample/',
# which does not match this path - the outputs may be stale; confirm.
dir_data = pathlib.Path("..") / "data" / "docs_sample"

In [2]:
from langchain.document_loaders import DirectoryLoader

# Loader over the sample-data directory, with the progress bar switched on.
loader_options = {"show_progress": True}
document_loader = DirectoryLoader(dir_data, **loader_options)

In [3]:
# Load everything the directory loader finds, then show the count and up to ten documents.
documents = document_loader.load()
documents_preview = documents[:10]
len(documents), documents_preview

100%|██████████| 3/3 [00:06<00:00,  2.33s/it]


(3,
 [Document(page_content='the dry bog\n\nalongside the shire of ag45i4nt there is a river that flows into a bog. all the water of the large river flows into it, but the bog is dry. not a single soul understands why. but the bog near the ag45i4nt shire is completely dry.', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='the 4831asx eye glasses\n\nnobody on the street would defy johnathan ferg-simons anymore. he is wearing his new eye glasses, the 4831asx. with these glasses can see through walls and shoot lasers. everybody is afraid of mr ferg-simons.', metadata={'source': '../data_sample/cyber-punk.txt'}),
  Document(page_content="the last kingdom\n\narguslweruna is the king here, but he doesn't care about people. he does as he pleases, drunk all day, barely ever listen to what needs to be done. however, in a moment of need, he put his life on the line and fought for everyone. and this is why people love him.", metadata={'source': '../data_sample/the

# Data Chunking

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents using chunk_size=50 and chunk_overlap=5,
# then show the chunk count and up to ten chunks.
splitter_options = {"chunk_size": 50, "chunk_overlap": 5}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
document_chunks = text_splitter.split_documents(documents)
chunks_preview = document_chunks[:10]
len(document_chunks), chunks_preview

(19,
 [Document(page_content='the dry bog', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='alongside the shire of ag45i4nt there is a river', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='that flows into a bog. all the water of the large', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='river flows into it, but the bog is dry. not a', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='a single soul understands why. but the bog near', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='near the ag45i4nt shire is completely dry.', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='the 4831asx eye glasses', metadata={'source': '../data_sample/cyber-punk.txt'}),
  Document(page_content='nobody on the street would defy johnathan', metadata={'source': '../data_sample/cyber-punk.txt'}),
  Document(page_co

# Vector Store

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.redis import Redis

# Embed every chunk with the named sentence-embedding model and build a Redis
# vector store from the result.
# NOTE(review): no connection details are passed here - presumably they come
# from the environment loaded in the Settings cell; confirm.
embedding_model_name = "multi-qa-MiniLM-L6-cos-v1"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
vector_store = Redis.from_documents(document_chunks, embeddings)
vector_store

<langchain.vectorstores.redis.base.Redis at 0x1387cfeb0>

In [8]:
# Probe the store with a question the sample documents do not cover.
# NOTE(review): in the recorded outputs this query scores 0.65-0.78, while the
# on-topic "dry bog" query starts at 0.09, so lower appears to mean closer -
# confirm the metric.
off_topic_query = "What is the greatest ocean in the world?"
vector_store.similarity_search_with_score(off_topic_query)

[(Document(page_content='that flows into a bog. all the water of the large', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:df74e62571054bccbc5a09e9d4a93271', 'source': '../data_sample/the-dry-bog.txt'}),
  0.6489),
 (Document(page_content='river flows into it, but the bog is dry. not a', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:00a0b8b81ef84dfdb63ff9201ea7f6ed', 'source': '../data_sample/the-dry-bog.txt'}),
  0.7548),
 (Document(page_content='the last kingdom', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:97db24d562134770a3694b1a47661cc5', 'source': '../data_sample/the-last-kingdom.txt'}),
  0.7736),
 (Document(page_content='alongside the shire of ag45i4nt there is a river', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:3e80f1307f0f473985e68ee89783197c', 'source': '../data_sample/the-dry-bog.txt'}),
  0.7801)]

In [9]:
# Probe the store with a question that one of the sample documents does cover.
on_topic_query = "Where is the dry bog?"
vector_store.similarity_search_with_score(on_topic_query)

[(Document(page_content='the dry bog', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:9f0a049baa6d4c0281790a14c0dabffb', 'source': '../data_sample/the-dry-bog.txt'}),
  0.0926),
 (Document(page_content='river flows into it, but the bog is dry. not a', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:00a0b8b81ef84dfdb63ff9201ea7f6ed', 'source': '../data_sample/the-dry-bog.txt'}),
  0.2632),
 (Document(page_content='that flows into a bog. all the water of the large', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:df74e62571054bccbc5a09e9d4a93271', 'source': '../data_sample/the-dry-bog.txt'}),
  0.3876),
 (Document(page_content='near the ag45i4nt shire is completely dry.', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:e90e3d30b5f645ed8f03065b2485eb2b', 'source': '../data_sample/the-dry-bog.txt'}),
  0.4868)]

# LLM

In [10]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

# Wrap a local HuggingFace text2text-generation pipeline around flan-t5-small.
# NOTE(review): temperature / max_length / do_sample are handed over through
# model_kwargs, and the recorded repr shows pipeline_kwargs is empty - confirm
# these settings take effect where intended.
generation_settings = {"temperature": 0.01, "max_length": 128, "do_sample": True}
llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-small",
    task="text2text-generation",
    model_kwargs=generation_settings,
)
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x1379e0c10>, model_id='google/flan-t5-small', model_kwargs={'temperature': 0.01, 'max_length': 128, 'do_sample': True}, pipeline_kwargs={})

# QA Chain

In [11]:
from langchain import hub

# Fetch the shared RAG prompt from the hub; per the recorded output it takes
# the input variables 'context' and 'question'.
prompt_ref = "rlm/rag-prompt"
qa_rag_prompt = hub.pull(prompt_ref)
qa_rag_prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [12]:
from langchain.chains import RetrievalQA


# Retriever over the vector store, using the "similarity_score_threshold"
# search type with a score_threshold of 0.5.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5},
)

# Retrieval QA chain: the LLM answers with the pulled RAG prompt and the chain
# also returns the source documents it retrieved.
qa = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": qa_rag_prompt},
    return_source_documents=True,
)
qa

RetrievalQA(combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x1379e0c10>, model_id='google/flan-t5-small', model_kwargs={'temperature': 0.01, 'max_length': 128, 'do_sample': True}, pipeline_kwargs={})), document_variable_name='context'), return_source_documents=True, retriever=RedisVectorStoreRetriever(tags=['Redis', 'HuggingFaceEmbeddings'], vectorstore=<langchain.vectorstores.redis.base.Red

# Question Answering

In [13]:
from typing import Any, List, Tuple


def ask(question: str) -> Tuple[str, List[Any]]:
    """Run one question through the ``qa`` chain.

    Args:
        question: Question text, forwarded to the chain under the ``"query"`` key.

    Returns:
        A ``(answer, source_documents)`` pair read from the chain output's
        ``"result"`` and ``"source_documents"`` entries.
    """
    result = qa({"query": question})
    # Two values are returned, so the annotation is a tuple rather than a bare str.
    return result["result"], result["source_documents"]

In [14]:
# NOTE(review): the recorded answer is 'king', and the single retrieved chunk
# does not include the "people love him" sentence, so this question is not
# actually answered by the recorded run.
ask(question="Who loves arguslweruna?")



('king',
 [Document(page_content="arguslweruna is the king here, but he doesn't", metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:e965cecb561e4a50bfe079704c320315', 'source': '../data_sample/the-last-kingdom.txt'})])

In [15]:
# Ask about a fact that the retrieved chunk states directly (recorded answer: 'king').
ask(question="What's arguslweruna role?")

('king',
 [Document(page_content="arguslweruna is the king here, but he doesn't", metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:e965cecb561e4a50bfe079704c320315', 'source': '../data_sample/the-last-kingdom.txt'})])

In [16]:
# Ask about the bog; the recorded run retrieved three chunks from the-dry-bog.txt
# and answered 'dry'.
ask(question="What is the bog near ag45i4nt like?")

('dry',
 [Document(page_content='the dry bog', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:9f0a049baa6d4c0281790a14c0dabffb', 'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='that flows into a bog. all the water of the large', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:df74e62571054bccbc5a09e9d4a93271', 'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='river flows into it, but the bog is dry. not a', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:00a0b8b81ef84dfdb63ff9201ea7f6ed', 'source': '../data_sample/the-dry-bog.txt'})])

In [17]:
# NOTE(review): the recorded answer ('a pair of glasses') names the object
# rather than its capabilities; the retrieved chunks stop at "with these
# glasses can", before the capabilities are listed.
ask(question="What is 4831asx capable of?")

('a pair of glasses',
 [Document(page_content='the 4831asx eye glasses', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:6270ed55e5fd4d119b70b30f333537f9', 'source': '../data_sample/cyber-punk.txt'}),
  Document(page_content='eye glasses, the 4831asx. with these glasses can', metadata={'id': 'doc:5a5cb33fb7744e12bdb8c5cb270ac99b:70e39318efda47e2bba868f7f6e586be', 'source': '../data_sample/cyber-punk.txt'})])