In [1]:
!pip install pandas langchain faiss-cpu sentence-transformers transformers langchain-community

Collecting numpy>=1.21.0
  Using cached numpy-2.3.2-cp311-cp311-win_amd64.whl (13.1 MB)
  Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl (15.8 MB)
     ---------------------------------------- 15.8/15.8 MB 2.8 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.4
    Uninstalling numpy-1.24.4:
      Successfully uninstalled numpy-1.24.4
Successfully installed numpy-1.26.4



[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd

# Read the Kaggle loan dataset, discard every row that contains a missing
# value, and renumber the surviving rows from zero.
df = (
    pd.read_csv("Training Dataset.csv")
    .dropna()
    .reset_index(drop=True)
)

# Show the first rows of the cleaned table as the cell output.
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [3]:
def _describe_applicant(row):
    """Render one row of the loan table as a single line of descriptive text."""
    return (
        f"Applicant ID: {row['Loan_ID']}. "
        f"Gender: {row['Gender']}, Married: {row['Married']}, Dependents: {row['Dependents']}, "
        f"Education: {row['Education']}, Self_Employed: {row['Self_Employed']}, "
        f"ApplicantIncome: {row['ApplicantIncome']}, CoapplicantIncome: {row['CoapplicantIncome']}, "
        f"LoanAmount: {row['LoanAmount']}, Loan_Amount_Term: {row['Loan_Amount_Term']}, "
        f"Credit_History: {row['Credit_History']}, Property_Area: {row['Property_Area']}, "
        f"Loan_Status: {row['Loan_Status']}."
    )


# One text document per row of df, kept in row order.
documents = [_describe_applicant(record) for _idx, record in df.iterrows()]

# Print the first document as a sanity check.
print("Sample document:\n", documents[0])


Sample document:
 Applicant ID: LP001003. Gender: Male, Married: Yes, Dependents: 1, Education: Graduate, Self_Employed: No, ApplicantIncome: 4583, CoapplicantIncome: 1508.0, LoanAmount: 128.0, Loan_Amount_Term: 360.0, Credit_History: 1.0, Property_Area: Rural, Loan_Status: N.


In [4]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence encoder that turns each text document into a vector.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every document and cast the result to float32 before handing it to FAISS.
embeddings = np.asarray(model.encode(documents), dtype="float32")

# Build an IndexFlatL2 whose dimensionality matches the embedding width,
# then add all document vectors to it.
vector_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(vector_dim)
index.add(embeddings)


  return forward_call(*args, **kwargs)


In [12]:
import pandas as pd
import pickle
from transformers import pipeline
from langchain.schema import Document
from langchain.docstore import InMemoryDocstore
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Step 0: Load FAISS index and raw document texts
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open a rag_index.pkl that this notebook wrote itself.
try:
    with open("rag_index.pkl", "rb") as f:
        index, raw_docs = pickle.load(f)
except FileNotFoundError as exc:
    # The file is written by the index-building cell of this notebook, so a
    # fresh kernel has to run that cell before this one.
    raise FileNotFoundError(
        "rag_index.pkl not found - run the cell that builds the FAISS index "
        "and saves rag_index.pkl before running this cell."
    ) from exc

# The id mapping built below is purely positional (vector i <-> raw_docs[i]),
# so a stale or mismatched pickle would make retrieval return the wrong rows.
if index.ntotal != len(raw_docs):
    raise ValueError(
        f"rag_index.pkl is inconsistent: the index holds {index.ntotal} vectors "
        f"but {len(raw_docs)} documents were stored alongside it."
    )

# Step 1: Convert raw documents to LangChain Documents
documents = [Document(page_content=doc) for doc in raw_docs]
documents_dict = {str(i): doc for i, doc in enumerate(documents)}
docstore = InMemoryDocstore(documents_dict)
index_to_docstore_id = {i: str(i) for i in range(len(documents))}

# Step 2: Load embeddings
# Same model name as the one used when the index was built, so query vectors
# live in the same space as the stored document vectors.
doc_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Step 3: Rebuild FAISS vector store
vectorstore = FAISS(
    embedding_function=doc_embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# Step 4: Load the LLM (Flan-T5)
gen_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
    max_new_tokens=100
)
llm = HuggingFacePipeline(pipeline=gen_pipeline)

# Step 5: Build the QA chain
# The retriever hands the three nearest documents to the LLM for each query.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
)

# Step 6: Custom logic for specific query
query = "What is the average loan amount for approved applicants?"
query_lc = query.lower()  # lower-case once; both keyword checks reuse it

if "average loan amount" in query_lc and "approved" in query_lc:
    # An aggregate over the whole table cannot be derived from the three
    # retrieved rows, so it is computed directly from the CSV instead.
    # NOTE(review): this rebinds the notebook-level `df` to the unfiltered CSV.
    df = pd.read_csv("Training Dataset.csv")
    approved_df = df[df["Loan_Status"] == "Y"]
    # Series.mean() skips missing LoanAmount values by default.
    average_loan = approved_df["LoanAmount"].mean()
    print("💬 The average loan amount for approved applicants is:", round(average_loan, 2))
else:
    response = qa_chain.invoke({"query": query})
    print("💬", response["result"].strip())


Device set to use cpu


💬 The average loan amount for approved applicants is: 144.29


In [1]:
query = "What is the average loan amount for approved applicants?"

# Questions that mention both key phrases are answered straight from the CSV;
# everything else is forwarded to the retrieval QA chain.
if all(phrase in query.lower() for phrase in ("average loan amount", "approved")):
    import pandas as pd
    df = pd.read_csv("Training Dataset.csv")
    approved_df = df.loc[df["Loan_Status"] == "Y"]
    # Drop missing loan amounts explicitly, then average what is left.
    known_amounts = approved_df["LoanAmount"].dropna()
    avg_loan = known_amounts.mean()
    print("💬 The average loan amount for approved applicants is:", round(avg_loan, 2))
else:
    # qa_chain must already exist in the kernel (it is created in another cell).
    result = qa_chain.invoke({"query": query})
    print("💬", result["result"])


💬 The average loan amount for approved applicants is: 144.29


In [3]:
import pandas as pd
import pickle
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document

# Step 1: Read the loan table from disk.
df = pd.read_csv("Training Dataset.csv")

# Step 2: Flatten each row into one "column: value, column: value, ..." string.
raw_docs = [
    ", ".join(f"{col}: {record[col]}" for col in df.columns)
    for _row_label, record in df.iterrows()
]

# Step 3: Wrap every string in a LangChain Document.
documents = [Document(page_content=text) for text in raw_docs]

# Step 4: Embedding model used to vectorize the documents.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Step 5: Embed the documents and build the FAISS vector store.
vectorstore = FAISS.from_documents(documents, embedding=embeddings)

# Step 6: Persist the underlying FAISS index together with the raw texts as a
# single pickled (index, raw_docs) tuple.
with open("rag_index.pkl", "wb") as pkl_file:
    pickle.dump((vectorstore.index, raw_docs), pkl_file)

print("✅ rag_index.pkl saved successfully.")


  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  return forward_call(*args, **kwargs)


✅ rag_index.pkl saved successfully.


In [4]:
# Keep only the rows whose Loan_Status is 'Y', then report the mean LoanAmount
# of that subset rounded to two decimals.
approved = df[df['Loan_Status'].eq('Y')]
avg_loan = approved['LoanAmount'].mean()
print("Average Loan Amount for Approved Applicants:", round(avg_loan, 2))


Average Loan Amount for Approved Applicants: 144.29
