In [None]:
#rag with the context only
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

#adding history to rag
from langchain_core.messages import HumanMessage, AIMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains import create_history_aware_retriever

from dotenv import load_dotenv
import os

# Load Documents

In [3]:
# Load every supported file in data/ into one flat list of Documents.
# sorted() keeps the document order reproducible across runs; os.listdir
# returns directory entries in arbitrary order.
list_dir = sorted(os.listdir('data'))
paths = [os.path.join('data', path) for path in list_dir]
docs_list = []

# The splitter settings never change between files, so build it once
# instead of once per PDF.
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=200)

for url_path in paths:
    # Compare the real, case-insensitive extension: endswith('csv') also
    # matched names such as 'notacsv' and skipped 'REPORT.CSV'.
    extension = os.path.splitext(url_path)[1].lower()
    if extension == '.csv':
        # CSV documents are added exactly as the loader returns them (no splitting).
        csv_loader = CSVLoader(url_path)
        docs_list += csv_loader.load()
    elif extension == '.pdf':
        # PDFs are split with chunk_size=600 / chunk_overlap=200 before indexing.
        pdf_loader = PyPDFLoader(url_path)
        pdf_doc = pdf_loader.load()
        pdf_doc_splitted = splitter.split_documents(pdf_doc)
        docs_list += pdf_doc_splitted
    # Any other file type in data/ is ignored.

In [46]:
# Sanity check: total number of Documents collected from the CSV and PDF files.
len(docs_list)

20887

# Retrieval

In [4]:
# Embedding model shared by index creation and retrieval; both must use the
# same model so stored vectors and query vectors are comparable.
embedding_model = HuggingFaceEmbeddings(model='all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Build the persistent Chroma index only when it does not exist yet.
# Re-running an unguarded from_documents() re-embeds every document and adds
# them to the already persisted collection again, so "Restart & Run All"
# would be slow and would fill the index with duplicates.
CHROMA_INDEX_DIR = '.chroma_index'
if os.path.isdir(CHROMA_INDEX_DIR) and os.listdir(CHROMA_INDEX_DIR):
    # An index is already on disk: just open it.
    chroma_db = Chroma(persist_directory=CHROMA_INDEX_DIR, embedding_function=embedding_model)
else:
    # First run: embed all loaded documents and persist the index.
    chroma_db = Chroma.from_documents(docs_list, embedding_model, persist_directory=CHROMA_INDEX_DIR)

In [5]:
# Open the index persisted under .chroma_index/ with the same embedding model
# that was used to build it.
chroma_db = Chroma(
    embedding_function=embedding_model,
    persist_directory='.chroma_index/',
)

  chroma_db = Chroma(persist_directory='.chroma_index/', embedding_function=embedding_model)


# Context Integration

In [6]:
# Load environment variables via python-dotenv, then read the Groq API key
# from the process environment (None when the variable is not set).
load_dotenv()
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')

In [7]:
# Chat model used by every chain in this notebook.
llm_model = ChatGroq(
    api_key=GROQ_API_KEY,
    model='meta-llama/llama-4-scout-17b-16e-instruct',
)

In [9]:
# Prompt for the history-free RAG chain: the retrieved documents are injected
# as {context} and the user's question as {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a financial analyst who helps users understand financial data and answer queries.",
        ),
        (
            "human",
            "Here is the data:\n{context}\n\nQuestion: {input}",
        ),
    ]
)

In [None]:
# Chain that stuffs the retrieved documents into `prompt` and calls the LLM.
document_chain = create_stuff_documents_chain(llm=llm_model, prompt=prompt)

In [8]:
# Retriever view over the Chroma index (default search settings).
# NOTE: the name is spelled `retriver` throughout this notebook; later cells depend on it.
retriver = chroma_db.as_retriever()

In [None]:
# Full history-free RAG chain: retrieve documents for the input, then answer
# with `document_chain`. Requires the `document_chain` cell to have been run.
retriver_chain = create_retrieval_chain(
    retriever=retriver,
    combine_docs_chain=document_chain,
)

NameError: name 'document_chain' is not defined

# LLM Response

In [None]:
# Ask the history-free chain a first question.
result = retriver_chain.invoke(
    {'input': 'Give me insight about loan approvals for low income applicants.'}
)

In [13]:
# Show only the generated answer text from the chain's result dict.
print(result['answer'])

Based on the provided data, I'll analyze the loan approvals for low-income applicants.

**Definition of Low Income:**
For the purpose of this analysis, let's consider low-income applicants as those with an ApplicantIncome less than ₹4000.

**Low-Income Applicants:**
From the provided data, the following loan applications have an ApplicantIncome less than ₹4000:

1. LP001034: ApplicantIncome = ₹3596 (Loan Status: Y)
2. LP001770: ApplicantIncome = ₹3189 (Loan Status: Y)

**Insights:**

* Both low-income applicants (LP001034 and LP001770) have been approved for loans (Loan Status: Y).
* The approved loan amounts for these applicants are ₹100 and ₹120, respectively.
* Although the loan amounts are relatively small, it indicates that the lender is willing to approve loans for low-income applicants.
* The credit history of LP001770 is 1, which might be a factor in the loan approval. However, the credit history of LP001034 is missing, so we can't draw any conclusions from that.

**Additional 

In [65]:
# Follow-up sent to the history-free chain. This chain receives only 'input',
# so it has no access to the previous answer the question refers to.
result = retriver_chain.invoke(
    {'input': 'You just gave me the answer, please make it more simple text'}
)

In [67]:
# Show the answer to the follow-up question.
print(result['answer'])

**Comprehensive Financial Data Analysis**

The provided dataset consists of four loan applications with various financial attributes. This analysis aims to identify key patterns, correlations, and anomalies within the data.

### **Overview of Key Financial Metrics**

* **Loan Amount and Duration**: The loan amounts range from $20,592 to $45,282, with durations varying from 36 to 60 months.
* **Income and Debt**: Annual incomes range from $20,196 to $81,573, with monthly debt payments between $264 and $679.
* **Credit Scores and Risk Scores**: Credit scores range from 467 to 606, and risk scores range from 45 to 61.

### **Correlation Analysis**

* **Credit Score and Risk Score**: A higher credit score appears to be associated with a lower risk score, indicating a potential inverse relationship.
* **Income and Debt-to-Income Ratio**: There is no clear correlation between income and debt-to-income ratio, suggesting that higher incomes do not necessarily translate to lower debt burdens.
*

## RAG with History 
create prompt template with history (MessagesPlaceholder('chat_history')) -> create retriever history dengan menggabungkan retriever, llm dan history prompt (create_history_aware_retriever) -> prompt -> chain llm dengan prompt (create_stuff_documents_chain) -> retrieval chain antara history retriever dan chain llm prompt (rag chain) -> untuk menggunakan chatbot simpan history kedalam list yang berisikan AIMessage dan HumanMessage

In [10]:
# System instructions for the question-rewriting step: the model must turn a
# follow-up question into a standalone one using the chat history, without answering it.
contextualized_system_prompt_template = """
You are a helpful assistant specialized in data analysis and answering questions about datasets.

Your job is to rephrase follow-up questions into fully standalone questions by using the previous chat history for context.

Only output the rephrased standalone question. Do not answer the question.
"""

In [11]:
# Prompt for the rewriting step: system instructions, then the prior
# conversation, then the latest user input.
contextualized_system_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualized_system_prompt_template),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

In [12]:
# Retriever that first rewrites the question with the LLM (using the chat
# history prompt) and then queries the base retriever.
retriever_aware_history = create_history_aware_retriever(
    llm=llm_model,
    retriever=retriver,
    prompt=contextualized_system_prompt,
)

In [13]:
# Answering prompt that combines the chat history with the retrieved
# {context} and the current {input}.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a financial analyst who helps users understand financial data and answer queries.",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        (
            "human",
            "Here is the data:\n{context}\n\nQuestion: {input}",
        ),
    ]
)

In [14]:
# Chain the LLM with qa_prompt: retrieved documents are stuffed into {context}.
llm_context_chain = create_stuff_documents_chain(llm=llm_model, prompt=qa_prompt)

In [15]:
# History-aware RAG chain: the history-aware retriever fetches documents and
# llm_context_chain produces the answer from them.
rag_chain = create_retrieval_chain(
    retriever=retriever_aware_history,
    combine_docs_chain=llm_context_chain,
)

### Chat History in list
- hanya satu pengguna
- belum butuh penyimpanan dalam jangka panjang
- hanya satu arah

In [34]:
# Start a fresh conversation; the history is a plain list of messages.
chat_history = []

question = 'Give me insight about loan approvals for low income applicants.'
response = rag_chain.invoke({'input': question, 'chat_history': chat_history})
print(response['answer'])

# Store the turn (user message, then model answer) so later questions can refer to it.
chat_history += [
    HumanMessage(content=question),
    AIMessage(content=response['answer']),
]

Based on the provided data, I'll analyze the loan approvals for low-income applicants.

**Low-Income Applicants:**
To define low-income applicants, let's assume an ApplicantIncome below 4000. From the data, we have:

1. LP001034: ApplicantIncome = 3596, Loan_Status = Y (Approved)
2. LP001770: ApplicantIncome = 3189, Loan_Status = Y (Approved)

**Insights:**

* Both low-income applicants (LP001034 and LP001770) have been approved for loans.
* Despite having a lower ApplicantIncome, LP001770 has a CoapplicantIncome of 2598, which might have positively impacted the loan approval.
* LP001034 has a relatively low LoanAmount of 100, which might have made it more feasible for the applicant to repay.

**Common Characteristics of Approved Low-Income Applicants:**

* Both are male.
* Not graduates (Education: Not Graduate).
* Not self-employed (Self_Employed: No).
* Have a relatively low LoanAmount compared to their ApplicantIncome.

**Critical Factors for Low-Income Applicants:**

* Credit_Hist

In [36]:
# Number of messages currently stored in the manual chat history.
print(len(chat_history))

2


In [32]:
# Follow-up question that only makes sense together with the stored history.
question = 'Make it more simple'
response = rag_chain.invoke({
    'input': question,
    'chat_history': chat_history
})
print(response['answer'])

# Record this turn too; otherwise later questions never see it because the
# list-based history is maintained manually.
chat_history.extend([
    HumanMessage(content=question),
    AIMessage(content=response['answer'])
])

Let's break down the loan approvals for low-income applicants:

**Low-Income Applicants:**

* LP001034: ApplicantIncome = 3596, CoapplicantIncome = 0, Loan_Status = Approved (Y)
* LP001586: ApplicantIncome = 3522, CoapplicantIncome = 0, Loan_Status = Not Approved (N)
* LP001722: ApplicantIncome = 150, CoapplicantIncome = 1800, Loan_Status = Not Approved (N)

**Insights:**

1. **Income matters, but not always**: LP001034 was approved with a low ApplicantIncome, while LP001586 was not approved with a similar income.
2. **Coapplicant income can help**: LP001722 had a low ApplicantIncome, but a high CoapplicantIncome, and still wasn't approved.
3. **Credit History and other factors play a role**: LP001586 and LP001722 had a good Credit_History (1), but still weren't approved.

**Simple Takeaways:**

* Low-income applicants can get approved (LP001034).
* Income and CoapplicantIncome are important, but not the only factors.
* Credit History and other factors like Property_Area and LoanAmount

In [37]:
# Question unrelated to the indexed data, asked within the same conversation.
question = 'What is LLM ?'
response = rag_chain.invoke({
    'input': question,
    'chat_history': chat_history
})
print(response['answer'])

# Keep the manual history complete so any later turn can refer back to this one.
chat_history.extend([
    HumanMessage(content=question),
    AIMessage(content=response['answer'])
])

LLM stands for Large Language Model. It refers to a type of artificial intelligence (AI) model designed to process and understand human language. LLMs are trained on vast amounts of text data and can generate human-like responses to a wide range of questions and prompts.

In the context of our conversation, I'm an example of an LLM, as I'm a computer program designed to understand and respond to natural language inputs. I'm trained on a massive dataset of text and can generate responses to various questions and topics, including financial analysis and insights, as we discussed earlier.

Some common applications of LLMs include:

1. Virtual assistants: LLMs can power virtual assistants, such as chatbots or voice assistants, to provide customer support or answer frequently asked questions.
2. Language translation: LLMs can be used for language translation, allowing users to communicate across languages.
3. Text summarization: LLMs can summarize long pieces of text into concise, easily di

### Chat History with configuration Session id 

- membuat chat dapat menyimpan dan mengingat riwayat obrolan di tiap sesi
- dapat menyimpan dan memanggil riwayat dari tiap sesi tanpa bentrok
- RunnableWithMessageHistory
- cocok digunakan untuk aplikasi multi user (streamlit, fastapi)
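- penyimpanan jangka panjang (jika `get_config_history` memakai backend history yang persisten; `store` berupa dict di notebook ini hanya tersimpan di memori selama kernel hidup)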
- history otomatis terpanggil
- bebas berapa banyak history yang diingat

In [16]:
# Per-session chat histories, keyed by session id (kept in memory only).
store = {}


def get_config_history(session_id) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``.

    An empty ``ChatMessageHistory`` is created and stored in ``store`` the
    first time a session id is seen; later calls return that same object.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history

In [None]:
# Wrap rag_chain so the history for each session is looked up through
# get_config_history: 'input' is the user message key, 'chat_history' the
# key the prompts read, and 'answer' the output key.
rag_with_config_history = RunnableWithMessageHistory(
    rag_chain,
    get_config_history,
    input_messages_key='input',
    history_messages_key='chat_history',
    output_messages_key='answer',
)
rag_with_config_history

RunnableWithMessageHistory(bound=RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  chat_history: RunnableBinding(bound=RunnableLambda(_enter_history), kwargs={}, config={'run_name': 'load_history'}, config_factories=[])
}), kwargs={}, config={'run_name': 'insert_history'}, config_factories=[])
| RunnableBinding(bound=RunnableLambda(_call_runnable_sync), kwargs={}, config={'run_name': 'check_sync_or_async'}, config_factories=[]), kwargs={}, config={'run_name': 'RunnableWithMessageHistory'}, config_factories=[]), kwargs={}, config={}, config_factories=[], get_session_history=<function get_config_history at 0x0000020512A92FC0>, input_messages_key='input', output_messages_key='answer', history_messages_key='chat_history', history_factory_config=[ConfigurableFieldSpec(id='session_id', annotation=<class 'str'>, name='Session ID', description='Unique identifier for a session.', default='', is_shared=True, dependencies=None)])

In [22]:
# Run configuration selecting which session's history to use.
config = dict(configurable=dict(session_id='chat1'))

In [24]:
# First question in session 'chat1'. Reuse the shared `config` defined above
# instead of repeating the dict, so the session id lives in one place.
rag_with_config_history.invoke(
    {'input': 'What kind person to let them approve their loan'},
    config=config
)['answer']

'Based on the provided text, the following types of individuals or entities may need to approve a loan:\n\n1. **Governmental agencies**\n2. **Preexisting secured parties** (e.g., existing lenders or creditors)\n3. **Landlords** and other **lessees** (if the business is leasing property or equipment)\n4. **Equity investors** (e.g., shareholders or owners who have invested in the business)\n\nAdditionally, borrowers should also consult with their own:\n\n1. **Legal counsel** (lawyer or attorney)\n2. **Bankers**\n3. **Accountants**\n4. **Other professional advisors**\n\nThese individuals can help borrowers navigate the loan process and ensure they understand the terms and conditions of the loan.'

In [26]:
# Second question in the same session; the wrapper supplies the stored
# history for the session named in the shared `config`.
# Note: `response` holds only the answer string here, not the full result dict.
response = rag_with_config_history.invoke(
    {'input': 'How the person requirements if their loan is approaved'},
    config=config
)['answer']

print(response)

Based on the provided data, I'll analyze the characteristics of individuals whose loans were approved (Loan_Status: Y) and provide insights on the requirements for loan approval.

**Approved Loans (Loan_Status: Y)**

1. **LP001034**
* Gender: Male
* Married: No
* Dependents:1
* Education: Not Graduate
* Self_Employed: No
* ApplicantIncome:3596
* CoapplicantIncome:0
* LoanAmount:100
* Credit_History: (missing value)
* Property_Area: Urban

2. **LP001819**
* Gender: Male
* Married: Yes
* Dependents:1
* Education: Not Graduate
* Self_Employed: No
* ApplicantIncome:6608
* CoapplicantIncome:0
* LoanAmount:137
* Credit_History:1
* Property_Area: Urban

**Common Characteristics of Approved Loans**

1. **Gender**: Both approved loans are for males.
2. **Education**: Both have not graduated.
3. **Self_Employed**: Neither is self-employed.
4. **Property_Area**: Both are from urban areas.
5. **Credit_History**: One has a Credit_History of1, and the other has a missing value.

**Requirements for L