# Settings

In [None]:
import pathlib

# Data directory, relative to the notebook's working directory; it is handed
# to the ETL step as its `dst` argument.
dir_data = pathlib.Path("..") / "data"


In [None]:
from dotenv import load_dotenv, find_dotenv

# Look up a .env file with find_dotenv() and load it. The return value is
# bound to `_` so the cell does not display it.
_ = load_dotenv(dotenv_path=find_dotenv())


# ETL

In [None]:
from hlm12rag.etl import etl_data_from_kaggle

# Run the project's Kaggle ETL helper for the question-answer dataset,
# with dir_data as the destination.
etl_data_from_kaggle(
    dataset="rtatman/questionanswer-dataset",
    dst=dir_data,
)


# Data Loading

In [1]:
import pathlib

# Point dir_data at the small sample corpus. Note that this rebinds the name
# the Settings section assigned to "../data".
dir_data = pathlib.Path("..") / "data_sample"


In [2]:
from langchain.document_loaders import DirectoryLoader

# Loader over the files found under dir_data; show_progress turns on the
# progress bar while loading.
document_loader = DirectoryLoader(path=dir_data, show_progress=True)


In [3]:
# Load the documents, then display their count together with (at most) the
# first ten of them.
documents = document_loader.load()
len(documents), documents[:10]


100%|██████████| 3/3 [00:04<00:00,  1.66s/it]


(3,
 [Document(page_content='the dry bog\n\nalongside the shire of ag45i4nt there is a river that flows into a bog. all the water of the large river flows into it, but the bog is dry. not a single soul understands why. but the bog near the ag45i4nt shire is completely dry.', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='the 4831asx eye less\n\nnobody on the street would defy johnathan ferg-simons anymore. he is wearing his new eye glasses, the 4831asx. he can see through walls and shoot lasers. everybody is afraid of mr ferg-simons.', metadata={'source': '../data_sample/cyber-punk.txt'}),
  Document(page_content="the last kingdom\n\narguslweruna is the king here, but he doesn't care about people. he does as he pleases, drunk all day, barely ever listen to what needs to be done. however, in a moment of need, he put his life on the line and fought for everyone. and this is why people love him.", metadata={'source': '../data_sample/the-last-kingdom.txt'}

# Data Chunking

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every loaded document into small overlapping chunks
# (chunk_size=50, chunk_overlap=5), then display the chunk count together
# with (at most) the first ten chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=5,
)
document_chunks = text_splitter.split_documents(documents)
len(document_chunks), document_chunks[:10]


(19,
 [Document(page_content='the dry bog', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='alongside the shire of ag45i4nt there is a river', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='that flows into a bog. all the water of the large', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='river flows into it, but the bog is dry. not a', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='a single soul understands why. but the bog near', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='near the ag45i4nt shire is completely dry.', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='the 4831asx eye less', metadata={'source': '../data_sample/cyber-punk.txt'}),
  Document(page_content='nobody on the street would defy johnathan', metadata={'source': '../data_sample/cyber-punk.txt'}),
  Document(page_conte

# Vector Store

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.docarray import DocArrayInMemorySearch

# Embedding model used to vectorise the chunks.
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")

# Build the in-memory vector store from the chunks and display it.
vector_store = DocArrayInMemorySearch.from_documents(
    documents=document_chunks,
    embedding=embeddings,
)
vector_store


  from .autonotebook import tqdm as notebook_tqdm


<langchain.vectorstores.docarray.in_memory.DocArrayInMemorySearch at 0x13e941060>

# LLM

In [6]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

# Hugging Face text2text-generation pipeline around google/flan-t5-small.
# The generation settings are forwarded through model_kwargs.
llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-small",
    task="text2text-generation",
    model_kwargs={"temperature": 0.01, "max_length": 128, "do_sample": True},
)
llm


HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x12e995a50>, model_id='google/flan-t5-small', model_kwargs={'temperature': 0.01, 'max_length': 128, 'do_sample': True}, pipeline_kwargs={})

# QA Chain

In [7]:
from langchain import hub

# Pull the "rlm/rag-prompt" prompt from the LangChain hub and display it.
# NOTE(review): no commit hash is given, so presumably the latest published
# version is fetched and the prompt text could change between runs — confirm
# and consider pinning.
qa_rag_prompt = hub.pull("rlm/rag-prompt")
qa_rag_prompt


ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [8]:
from langchain.chains import RetrievalQA


# Retrieval QA chain: the vector store acts as retriever, the hub prompt is
# passed to the underlying chain, and the retrieved source documents are
# returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs=dict(prompt=qa_rag_prompt),
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
)
qa


RetrievalQA(combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x12e995a50>, model_id='google/flan-t5-small', model_kwargs={'temperature': 0.01, 'max_length': 128, 'do_sample': True}, pipeline_kwargs={})), document_variable_name='context'), return_source_documents=True, retriever=VectorStoreRetriever(tags=['DocArrayInMemorySearch'], vectorstore=<langchain.vectorstores.docarray.in_memory.DocArray

# Question Answering

In [9]:
def ask(question: str) -> tuple:
    """Answer ``question`` with the retrieval QA chain ``qa``.

    Args:
        question: Question text; it is sent to the chain under the
            ``"query"`` key.

    Returns:
        A 2-tuple ``(answer, source_documents)`` built from the chain
        result's ``"result"`` and ``"source_documents"`` entries.
    """
    # The original annotation claimed ``str``, but two values are returned.
    response = qa({"query": question})
    return response["result"], response["source_documents"]


In [10]:
# Question about the king described in the-last-kingdom sample text.
ask(question="Who loves arguslweruna?")




('people',
 [Document(page_content="arguslweruna is the king here, but he doesn't", metadata={'source': '../data_sample/the-last-kingdom.txt'}),
  Document(page_content='and this is why people love him.', metadata={'source': '../data_sample/the-last-kingdom.txt'}),
  Document(page_content='his life on the line and fought for everyone. and', metadata={'source': '../data_sample/the-last-kingdom.txt'}),
  Document(page_content='care about people. he does as he pleases, drunk', metadata={'source': '../data_sample/the-last-kingdom.txt'})])

In [11]:
# Second question about the same character from the-last-kingdom sample text.
ask(question="What's arguslweruna role?")


('king',
 [Document(page_content="arguslweruna is the king here, but he doesn't", metadata={'source': '../data_sample/the-last-kingdom.txt'}),
  Document(page_content='his life on the line and fought for everyone. and', metadata={'source': '../data_sample/the-last-kingdom.txt'}),
  Document(page_content='and this is why people love him.', metadata={'source': '../data_sample/the-last-kingdom.txt'}),
  Document(page_content='care about people. he does as he pleases, drunk', metadata={'source': '../data_sample/the-last-kingdom.txt'})])

In [12]:
# Question about the bog described in the-dry-bog sample text.
ask(question="What is the bog near ag45i4nt like?")


('dry',
 [Document(page_content='the dry bog', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='that flows into a bog. all the water of the large', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='river flows into it, but the bog is dry. not a', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='a single soul understands why. but the bog near', metadata={'source': '../data_sample/the-dry-bog.txt'})])

In [13]:
# Question about the glasses described in the cyber-punk sample text.
ask(question="What is 4831asx capable of?")


('shooting lasers',
 [Document(page_content='the 4831asx eye less', metadata={'source': '../data_sample/cyber-punk.txt'}),
  Document(page_content='eye glasses, the 4831asx. he can see through', metadata={'source': '../data_sample/cyber-punk.txt'}),
  Document(page_content='alongside the shire of ag45i4nt there is a river', metadata={'source': '../data_sample/the-dry-bog.txt'}),
  Document(page_content='walls and shoot lasers. everybody is afraid of mr', metadata={'source': '../data_sample/cyber-punk.txt'})])