# Settings

In [None]:
import pathlib

# Data directory used by the cells below (relative path, one level above the
# current working directory).
dir_data = pathlib.Path("../data")


In [None]:
from dotenv import load_dotenv, find_dotenv

# Load the .env file located by find_dotenv(); the return value is bound to
# `_` so the cell does not echo it.
_ = load_dotenv(find_dotenv())


# ETL

In [None]:
from hlm12rag.etl import etl_data_from_kaggle

# Run the project's Kaggle ETL helper for the question-answer dataset, with
# `dir_data` as the destination.
# NOTE(review): presumably downloads and unpacks the dataset - confirm in hlm12rag.etl.
etl_data_from_kaggle(dataset="rtatman/questionanswer-dataset", dst=dir_data)


# Data Loading

In [None]:
import pathlib

# Rebinds `dir_data` (previously "../data") to the sample folder, so every
# cell from here on reads from "../data_sample".
dir_data = pathlib.Path("../data_sample")


In [None]:
from langchain.document_loaders import DirectoryLoader

# Loader over the files in `dir_data`, with a progress bar enabled.
document_loader = DirectoryLoader(dir_data, show_progress=True)


In [None]:
# Run the loader; the trailing tuple makes the cell display the number of
# documents and the first ten of them.
documents = document_loader.load()
(len(documents), documents[:10])


# Data Chunking

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into small overlapping chunks
# (chunk_size=50, chunk_overlap=5), then display the chunk count and the
# first ten chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=5)
document_chunks = text_splitter.split_documents(documents)
(len(document_chunks), document_chunks[:10])


# Vector Store

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.docarray import DocArrayInMemorySearch

# Embed the chunks with the "multi-qa-MiniLM-L6-cos-v1" model and index them
# in an in-memory DocArray vector store.
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
vector_store = DocArrayInMemorySearch.from_documents(document_chunks, embeddings)
vector_store


In [None]:
# Retrieval spot check: display the stored chunks matched to this query together with their scores.
vector_store.similarity_search_with_score("What is the greatest ocean in the world?")


In [None]:
# Second retrieval spot check with a different query; displays (chunk, score) matches.
vector_store.similarity_search_with_score("Where is the dry bog?")


# LLM

In [None]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

# Wrap the "google/flan-t5-small" checkpoint as a LangChain LLM running a
# text2text-generation pipeline; the trailing expression displays the wrapper.
llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-small",
    task="text2text-generation",
    model_kwargs={"temperature": 0.01, "max_length": 128, "do_sample": True},
)
llm


# QA Chain

In [None]:
from langchain import hub

# Pull the "rlm/rag-prompt" prompt from the LangChain hub and display it.
qa_rag_prompt = hub.pull("rlm/rag-prompt")
qa_rag_prompt


In [None]:
from langchain.chains import RetrievalQA

# Assemble the retrieval QA chain: the retriever is the vector store in
# "similarity_score_threshold" mode (score_threshold 0.5), answers are
# generated by `llm` with the hub prompt, and source documents are returned
# alongside each answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs=dict(prompt=qa_rag_prompt),
    retriever=vector_store.as_retriever(
        search_kwargs=dict(score_threshold=0.5),
        search_type="similarity_score_threshold",
    ),
    return_source_documents=True,
)
qa


# Question Answering

In [None]:
from typing import Tuple


def ask(question: str) -> Tuple[str, list]:
    """Run the retrieval QA chain on *question*.

    Args:
        question: Question text, passed to the chain as its ``"query"`` input.

    Returns:
        A ``(answer, source_documents)`` pair read from the ``"result"`` and
        ``"source_documents"`` entries of the chain output.
    """
    result = qa({"query": question})
    return result["result"], result["source_documents"]


In [None]:
# Ask the QA chain a sample question; the cell displays the (answer, source documents) tuple.
ask("Who loves arguslweruna?")


In [None]:
# Sample question for the QA chain; displays the answer together with its source documents.
ask("What's arguslweruna role?")


In [None]:
# Sample question for the QA chain; displays the answer together with its source documents.
ask("What is the bog near ag45i4nt like?")


In [None]:
# Sample question for the QA chain; displays the answer together with its source documents.
ask("What is 4831asx capable of?")
