<h1>RAG System for CSV</h1>

In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
import pandas as pd


<h2>Import Data</h2>

The data was acqired from https://demo.oshinit.com/pages/mortgage-calculator.html. 300000 for amount, 360 months and Interest rate of 5%.

In [2]:
file_path = './data/mortgage-300h-360m-5r.csv'

data = pd.read_csv(file_path, encoding='utf-8')

data.head()

Unnamed: 0,Period,Monthly Payment,Computed Interest Due,Principal Due,Principal Balance
0,0,0.0,0.0,0.0,300000.0
1,1,1610.464869,1250.0,360.464869,299639.535131
2,2,1610.464869,1248.498063,361.966806,299277.568325
3,3,1610.464869,1246.989868,363.475001,298914.093324
4,4,1610.464869,1245.475389,364.98948,298549.103844


load and process data

In [None]:
loader = CSVLoader(
    file_path=file_path,
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "fieldnames": ["Period","Monthly Payment","Computed Interest Due","Principal Due","Principal Balance"],
    },
    encoding="windows-1252" 
)

try:
    docs = loader.load()
except Exception as e:
    print("Error during data loading:", e)

print(docs)
# text_splitter = RecursiveCharacterTextSplitter(
#     # Set a really small chunk size, just to show.
#     chunk_size=100,
#     chunk_overlap=20,
#     length_function=len,
#     is_separator_regex=False,
# )

# chunks = text_splitter.split_documents(docs)
# print(chunks[0])

# for record in docs[:2]:
#     print(record)
# documents =  [Document(page_content=text) for text in docs]
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
# chunks = text_splitter.split_documents(documents)



[Document(metadata={'source': './data/mortgage-300h-360m-5r.csv', 'row': 0}, page_content='Period: Period\nMonthly Payment: Monthly Payment\nComputed Interest Due: Computed Interest Due\nPrincipal Due: Principal Due\nPrincipal Balance: Principal Balance'), Document(metadata={'source': './data/mortgage-300h-360m-5r.csv', 'row': 1}, page_content='Period: 0\nMonthly Payment: 0\nComputed Interest Due: 0\nPrincipal Due: 0\nPrincipal Balance: 300000'), Document(metadata={'source': './data/mortgage-300h-360m-5r.csv', 'row': 2}, page_content='Period: 1\nMonthly Payment: 1610.4648690364193\nComputed Interest Due: 1250\nPrincipal Due: 360.46486903641926\nPrincipal Balance: 299639.5351309636'), Document(metadata={'source': './data/mortgage-300h-360m-5r.csv', 'row': 3}, page_content='Period: 2\nMonthly Payment: 1610.4648690364193\nComputed Interest Due: 1248.4980630456703\nPrincipal Due: 361.966805990749\nPrincipal Balance: 299277.5683249729'), Document(metadata={'source': './data/mortgage-300h-36

: 

In [None]:
# Create vector database
vector_db = Chroma.from_documents(
    documents=docs, embedding=OllamaEmbeddings(model="mistral:latest")
)

  documents=docs, embedding=OllamaEmbeddings(model="mistral:latest")


In [None]:
# LLM from Ollama
local_model = "mistral:latest"
llm = ChatOllama(model=local_model)

In [None]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five question-answering variations of the given user question to retrieve relevant documents from a vector database. By framing the query as potential answers to a question, your goal is to identify documents that directly address the user's information need. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [None]:
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [None]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
chain.invoke("What is the total monthly payment?")

In [None]:
# loader = DirectoryLoader('docs2', glob='**/*.htm', loader_cls=BSHTMLLoader, loader_kwargs={'open_encoding': 'utf8'}, show_progress=True, use_multithreading=True)

# docs = loader.load()

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
# splits = text_splitter.split_documents(docs)

# model = 'Alibaba-NLP/gte-large-en-v1.5'
# model_kwargs = model_kwargs = {'device':'cpu', 'trust_remote_code': True}
# encode_kwargs = {'normalize_embeddings': True}
# embeddings = HuggingFaceEmbeddings(
#     model_name=model,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs
# )

# vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)