# Settings

In [None]:
import pathlib

# Relative path of the data directory; it is handed to the ETL step as the
# download destination.
dir_data = pathlib.Path("../data")


In [None]:
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file located by find_dotenv().
# The return value is bound to `_` so the cell does not display it.
# NOTE(review): presumably this supplies the Kaggle credentials needed by the
# ETL step — confirm.
_ = load_dotenv(find_dotenv())


# ETL

In [None]:
from hlm12rag.etl import etl_data_from_kaggle

# Run the project's Kaggle ETL helper for the question/answer dataset, with
# dir_data as the destination.
# NOTE(review): presumably this downloads and unpacks the dataset into `dst`;
# confirm against hlm12rag.etl.
etl_data_from_kaggle(dataset="rtatman/questionanswer-dataset", dst=dir_data)


# Data Loading

In [1]:
# pathlib is imported again here so this section can run on its own; the
# Settings/ETL cells above show no execution count in the saved notebook.
import pathlib

# Rebinds dir_data: the Settings section set it to "../data", whereas the
# loader below reads from the "../data_sample" directory instead.
dir_data = pathlib.Path("../data_sample")


In [2]:
from langchain.document_loaders import DirectoryLoader

# Loader rooted at dir_data; show_progress=True makes load() print a progress
# bar while the files are read.
document_loader = DirectoryLoader(dir_data, show_progress=True)


In [3]:
# Read the files into LangChain Document objects (about 5 s for 3 files in the
# recorded run).
documents = document_loader.load()
# Display the document count together with a preview of up to ten documents.
(len(documents), documents[:10])


100%|██████████| 3/3 [00:05<00:00,  1.76s/it]


(3,
 [Document(page_content='kangaroo\n\nA kangaroo is a marsupial from the family Macropodidae (macropods, meaning \'large foot\'). In common use the term is used to describe the largest species from this family, the Red Kangaroo, the Antilopine Kangaroo, and the Eastern and Western Grey Kangaroo of the Macropus genus. The family also includes many smaller species which include the wallabies, tree-kangaroos, wallaroos, pademelons and the Quokka, some 63 living species in all. Kangaroos are endemic to the continent of Australia, while the smaller macropods are found in Australia and New Guinea.\n\nIn general, larger kangaroos have adapted much better to changes wrought to the Australian landscape by humans and though many of their smaller cousins are endangered, they are plentiful. They are not farmed to any extent, but wild kangaroos are shot for meat, over which there is controversy. Steve Dow: "An industry that\'s under the gun". Sydney Morning Herald online, September 26, 2007.\n\n

# Data Chunking

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Deliberately tiny chunks (size 50, overlap 5): in the recorded run the 3
# sample documents become 1427 chunks, many of them sentence fragments.
# NOTE(review): sizes are presumably measured in characters — confirm; a larger
# chunk_size would likely give the retriever more useful context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=5)
document_chunks = text_splitter.split_documents(documents)
# Display the chunk count together with a preview of the first ten chunks.
(len(document_chunks), document_chunks[:10])


(1427,
 [Document(page_content='kangaroo', metadata={'source': '../data_sample/S08_set1_a1.txt'}),
  Document(page_content='A kangaroo is a marsupial from the family', metadata={'source': '../data_sample/S08_set1_a1.txt'}),
  Document(page_content="Macropodidae (macropods, meaning 'large foot').", metadata={'source': '../data_sample/S08_set1_a1.txt'}),
  Document(page_content='In common use the term is used to describe the', metadata={'source': '../data_sample/S08_set1_a1.txt'}),
  Document(page_content='the largest species from this family, the Red', metadata={'source': '../data_sample/S08_set1_a1.txt'}),
  Document(page_content='Red Kangaroo, the Antilopine Kangaroo, and the', metadata={'source': '../data_sample/S08_set1_a1.txt'}),
  Document(page_content='the Eastern and Western Grey Kangaroo of the', metadata={'source': '../data_sample/S08_set1_a1.txt'}),
  Document(page_content='the Macropus genus. The family also includes many', metadata={'source': '../data_sample/S08_set1_a1.txt

# Vector Store

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.docarray import DocArrayInMemorySearch

# Embed the chunks with the Hugging Face model named below and index them in
# an in-memory DocArray vector store.
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
vector_store = DocArrayInMemorySearch.from_documents(
    documents=document_chunks,
    embedding=embeddings,
)
vector_store


  from .autonotebook import tqdm as notebook_tqdm


<langchain.vectorstores.docarray.in_memory.DocArrayInMemorySearch at 0x14105e890>

# LLM

In [23]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

# Local text2text-generation pipeline around flan-t5-small. Sampling is
# switched on, but with a near-zero temperature.
llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-small",
    task="text2text-generation",
    model_kwargs={"temperature": 0.01, "max_length": 128, "do_sample": True},
)
llm


HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x14216f490>, model_id='google/flan-t5-small', model_kwargs={'temperature': 0.01, 'max_length': 128, 'do_sample': True}, pipeline_kwargs={})

# QA Chain

In [24]:
from langchain import hub

# Pull the "rlm/rag-prompt" template from the LangChain Hub. The recorded repr
# shows a chat prompt with `context` and `question` input variables.
qa_rag_prompt = hub.pull("rlm/rag-prompt")
qa_rag_prompt


ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [25]:
from langchain.chains import RetrievalQA

# Combine the LLM, a retriever over the vector store, and the RAG prompt into
# a single retrieval question-answering chain.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs=dict(prompt=qa_rag_prompt),
    retriever=vector_store.as_retriever(),
)
qa


RetrievalQA(combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x14216f490>, model_id='google/flan-t5-small', model_kwargs={'temperature': 0.01, 'max_length': 128, 'do_sample': True}, pipeline_kwargs={})), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['DocArrayInMemorySearch'], vectorstore=<langchain.vectorstores.docarray.in_memory.DocArrayInMemorySearch object at 0x141

# Question Answering

In [26]:
def ask(question: str) -> str:
    """Answer ``question`` with the notebook-level ``qa`` chain.

    Args:
        question: The question text, passed to the chain under the "query" key.

    Returns:
        The value stored under the "result" key of the chain's output.
    """
    return qa({"query": question})["result"]


In [27]:
# First sample question for the QA chain.
ask("What animal represents australia?")




'kangaroo'

In [28]:
# Sample question about where the name "kangaroo" comes from.
ask("what's the origin of the name kangaroo?")


'Guugu grey kangaroo'

In [29]:
# Two-part sample question: relatives of the panther and where they live.
# The query text was corrected ("cousings" -> "cousins"); re-run the cell to
# refresh the recorded answer.
ask("what are the cousins of the panther in the animal kingdom, and where are they from?")


'lion'