# 1. RAG built with LlamaIndex

In [None]:
import os
os.environ["LANGCHAIN_TELEMETRY"] = "false"

from dotenv import load_dotenv
load_dotenv()

from langchain.chains import RetrievalQA

In [None]:
from llama_index.core import VectorStoreIndex, Document, Settings, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.text_splitter import TokenTextSplitter, SentenceSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

###########
# Install: pip install llama-index-llms-huggingface
from llama_index.legacy.llms.huggingface import HuggingFaceLLM

# Load model + tokenizer manually (T5 is seq2seq)
model_name = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


Settings.tokenizer = tokenizer

Settings.llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.1},
)


# Difference is of 's' in the ending of HuggingFaceEmbeddings and HuggingFaceEmbedding
# Settings.embed_model =  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") # this Class is for LangChain embeddings
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2") # this Class is for LlamaIndex

## Method 2: PDF documents -> LlamaIndex
documents2 = SimpleDirectoryReader("docs/").load_data()
# print(f"Loaded {len(documents2)} documents")

# Create index with text splitter
# text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=100)
index = VectorStoreIndex.from_documents(
    documents2,
)


In [None]:
# LangChain RAG + LlamaIndex
# also query LlamaIndex
query = input("\nEnter your question (or 'exit' to quit): ")

query_engine = index.as_query_engine()
response = query_engine.query(query)
print("\nAnswer (LlamaIndex):")
print(response)

In [2]:
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama


In [9]:
documents = SimpleDirectoryReader("docs/").load_data()
index = VectorStoreIndex.from_documents(
    documents,
)



In [8]:
## Method 1: Using Ollama
# Settings.llm = Ollama(
#     model= "tinyllama",
#     temperature=0.1,
#     request_timeout=360.0,
# )

## Method 2: Using HuggingFace open source
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# from llama_index.legacy.llms.huggingface import HuggingFaceLLM
from llama_index.llms.huggingface import HuggingFaceLLM

model_name = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Settings.llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.1},
)

# set the embed model
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    embed_batch_size=2,
)


2025-12-07 02:15:12,145 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2025-12-07 02:15:14,723 - INFO - 1 prompt is loaded, with the key: query


## Response with Ollama model

In [4]:
query_engine = index.as_query_engine()
query_engine.query("What is weather")

2025-12-07 02:04:19,903 - INFO - HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
2025-12-07 02:05:33,842 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


Response(response='The weather in the given context is described as "weather" and not "prior knowledge". The phrase "not prior knowledge" means that the context does not include any information about the weather beforehand, which is why the answer is "weather".', source_nodes=[NodeWithScore(node=TextNode(id_='d9540249-9cfd-409c-a624-0967f9096a5b', embedding=None, metadata={'page_label': '160', 'file_name': 'Db114952.pdf', 'file_path': '/Users/imbilalbutt/PycharmProjects/RAGwithFastAPI/docs/Db114952.pdf', 'file_type': 'application/pdf', 'file_size': 6061722, 'creation_date': '2025-08-16', 'last_modified_date': '2025-08-16'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='954da0e5-2bdd-4d30-884d-b8059e94ace3

## Response with HuggingFace model

In [10]:
query_engine = index.as_query_engine()
query_engine.query("What is weather")

Token indices sequence length is longer than the specified maximum sequence length for this model (1234 > 512). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Response(response='Empty Response', source_nodes=[NodeWithScore(node=TextNode(id_='29c6909b-2750-46b4-9933-95eea79e008a', embedding=None, metadata={'page_label': '160', 'file_name': 'Db114952.pdf', 'file_path': '/Users/imbilalbutt/PycharmProjects/RAGwithFastAPI/docs/Db114952.pdf', 'file_type': 'application/pdf', 'file_size': 6061722, 'creation_date': '2025-08-16', 'last_modified_date': '2025-08-16'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='51ba17dd-9d43-46f9-9fe2-fe55d0317e72', node_type='4', metadata={'page_label': '160', 'file_name': 'Db114952.pdf', 'file_path': '/Users/imbilalbutt/PycharmProjects/RAGwithFastAPI/docs/Db114952.pdf', 'file_type': 'application/pdf', 'file_size': 6061722, 'creation_da

# 2. LlamaIndex with Chroma vector store