## RAG Automation

1. ### Loading Data
   1. Textloader
   2. Webloader
   3. Pdfloader

In [4]:
!pip install langchain-community



In [None]:
# Text loader
from langchain_community.document_loaders import TextLoader
textLoader = TextLoader("sample.txt").load()
textLoader

[Document(metadata={'source': 'sample.txt'}, page_content='The Future of Artificial Intelligence in Healthcare\n\nIntroduction\n\nArtificial intelligence (AI) is transforming industries worldwide, and healthcare is no exception. With the ability to analyze vast amounts of data, automate processes, and provide insights, AI is becoming an essential tool in improving patient outcomes, reducing costs, and enhancing operational efficiency. This article explores the current state of AI in healthcare, its benefits, challenges, and future prospects.\n\nCurrent Applications of AI in Healthcare\n\n1. Diagnostic Support\n\nAI-powered diagnostic tools, such as image recognition systems for radiology, can detect diseases with high accuracy, assisting doctors in making faster and more accurate diagnoses.\n\n2. Predictive Analytics\n\nHospitals use AI to predict patient outcomes, identify high-risk patients, and prevent hospital readmissions. Machine learning models analyze patient data, including me

In [7]:
# Web loader
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader('https://www.wikipedia.org/')
loader.requests_kwargs = {'verify': False}
data = loader.load()
data[0].page_content[:100]



'\n\n\n\nWikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nWikipedia\n\nThe Free Encyclopedia\n\n\n\n\n\n\nEnglish\n6,974,000+ articl'

In [18]:
# PDF Loader
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("sample.pdf")
pdfData = loader.load()
pdfData[:4]

[Document(metadata={'producer': 'pdfTeX-1.40.20', 'creator': 'LaTeX with hyperref', 'creationdate': '2020-11-04T06:19:07+00:00', 'author': 'Author', 'keywords': '', 'moddate': '2020-11-04T07:24:14+01:00', 'subject': '', 'title': 'Title', 'trapped': '/False', 'rgid': 'PB:348446595_AS:979780811882502@1610609357829', 'source': 'sample.pdf', 'total_pages': 243, 'page': 0, 'page_label': '1'}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/348446595\nGame Development Research\nBook · November 2020\nCITATIONS\n10\nREADS\n27,904\n1 author:\nHenrik Engström\nUniversity of Skövde\n84 PUBLICATIONS\xa0\xa0\xa01,148 CITATIONS\xa0\xa0\xa0\nSEE PROFILE\nAll content following this page was uploaded by Henrik Engström on 14 January 2021.\nThe user has requested enhancement of the downloaded file.'),
 Document(metadata={'producer': 'pdfTeX-1.40.20', 'creator': 'LaTeX with hyperref', 'creationdate': '2020-11-04T06:19:07+00:00', 

In [None]:
# Splitting Data
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_document = text_splitter.split_documents(pdfData)
split_document[:5]

[Document(metadata={'producer': 'pdfTeX-1.40.20', 'creator': 'LaTeX with hyperref', 'creationdate': '2020-11-04T06:19:07+00:00', 'author': 'Author', 'keywords': '', 'moddate': '2020-11-04T07:24:14+01:00', 'subject': '', 'title': 'Title', 'trapped': '/False', 'rgid': 'PB:348446595_AS:979780811882502@1610609357829', 'source': 'sample.pdf', 'total_pages': 243, 'page': 0, 'page_label': '1'}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/348446595\nGame Development Research\nBook · November 2020\nCITATIONS\n10\nREADS\n27,904\n1 author:\nHenrik Engström\nUniversity of Skövde\n84 PUBLICATIONS\xa0\xa0\xa01,148 CITATIONS\xa0\xa0\xa0\nSEE PROFILE\nAll content following this page was uploaded by Henrik Engström on 14 January 2021.\nThe user has requested enhancement of the downloaded file.'),
 Document(metadata={'producer': 'pdfTeX-1.40.20', 'creator': 'LaTeX with hyperref', 'creationdate': '2020-11-04T06:19:07+00:00', 

In [None]:
from langchain_ollama import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

single_vector = embeddings.embed_query("What is the capital of France?")
single_vector

In [24]:
from langchain_chroma import Chroma
vector_store = Chroma(
    collection_name="vector_store_chroma",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db"
)
vector_store.add_documents(split_document[:50])

['12b22f67-a697-4583-ae9d-db291b856f54',
 '56ebb081-2009-44fd-8835-93f79e3ce871',
 'fd47497b-8e43-45cb-a4d7-5936f483119d',
 '9f12dc3b-8611-4dbd-a1e3-de9c0daf9049',
 '5f070d2f-88bb-41d1-a47a-3f797d39623c',
 '9f434836-db3f-45a9-bc6f-d20815a70a48',
 'a256eca6-7b2a-4ca4-a693-162e755e86db',
 'aa0c7efa-9f3d-40f0-9988-d62032c45b02',
 'ac840e27-7fbf-4c21-a9b9-5e4b8298d11b',
 '67199217-678b-4fa7-9a7f-321a97eff854',
 '05d5360c-a8fa-4862-82c4-be4dca556558',
 '27999ba0-5cac-466a-a487-11fe5206e240',
 'f18b573e-e1c8-4c9a-acdf-989058166628',
 'caaff79a-ba50-4872-b283-05a0743bafd7',
 'c06124be-7638-463b-9145-dcc64c4cae0b',
 '22f66723-4a40-4969-af6a-3431ee6c5dca',
 '2ca80c14-b59d-422d-afc7-f92b5734f6b5',
 'bc64d142-068b-4d8f-a7ab-b1db3b402d4b',
 '1b91c1f0-1245-4b36-b965-06980bff003d',
 '350f731f-e3c1-4693-a8d3-592e4f9ba077',
 '7a95235d-1aa1-4f0e-b876-daa7404eba5b',
 'dceaddce-30e2-4c88-9877-dc3814ea473a',
 'ec6439a9-570b-4da1-9854-cb84078b469d',
 'a6d88203-b5ab-4010-aecf-5d02dcd9c26a',
 'bda5dc79-cfbe-

In [38]:
result = vector_store.similarity_search(
    "who is the autor of the book",
)
doc_retriever = vector_store.as_retriever()

In [None]:
# Import Ollama LLM model

from langchain_ollama import OllamaLLM
llm=OllamaLLM(model='gemma3')
llm

OllamaLLM(model='gemma3')

In [None]:
# Define Prompt
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template(
("""
Answer the following question based only on the provided context. 
Think step by step before providing a detailed answer. 
<context>
{context}
</context>
Question: {input}""")
)

value = prompt.invoke({
    "context": "Hello there! What is the capital of France?",
    "input": "The input processing"
})
value

ChatPromptValue(messages=[HumanMessage(content='\nAnswer the following question based only on the provided context. \nThink step by step before providing a detailed answer. \nI will tip you $1000 if the user finds the answer helpful. \n<context>\nHello there! What is the capital of France?\n</context>\nQuestion: The input processing', additional_kwargs={}, response_metadata={})])

In [39]:
# Chain prompt and LLM
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain = create_stuff_documents_chain(llm,prompt)

In [41]:
# Chain the document chain and the retriever
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(doc_retriever, document_chain)
result = retrieval_chain.invoke({
    "input": "What is the capital of France?"
})
result

{'input': 'What is the capital of France?',
 'context': [Document(id='05d5360c-a8fa-4862-82c4-be4dca556558', metadata={'source': 'sample.pdf', 'total_pages': 243, 'producer': 'pdfTeX-1.40.20', 'subject': '', 'title': 'Title', 'author': 'Author', 'page': 7, 'keywords': '', 'creationdate': '2020-11-04T06:19:07+00:00', 'rgid': 'PB:348446595_AS:979780811882502@1610609357829', 'creator': 'LaTeX with hyperref', 'page_label': '8', 'moddate': '2020-11-04T07:24:14+01:00', 'trapped': '/False'}, page_content='4.2.5 Summary . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 108\n4.2.6 Recommended reading . . . . . . . . . . . . . . . . . . . . . . . . . 109\n4.3 Forums for media production game research 110\n5 Management and business . . . . . . . . . . . . . . . . . . . . 113\n5.1 Creativity management 115\n5.1.1 Research overview . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 115\n5.1.2 Recommended reading . . . . . . . . . . . . . . . . . . . . . . . . . 12