In [1]:
!pip install langchain langchain-community langchain-huggingface chromadb sentence-transformers

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Collecting chromadb
  Downloading chromadb-1.5.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting huggingface-hub<1.0.0,>=0.33.4 (from langchain-huggingface)
  Downloading huggingface_hub-0.36.2-py3-none-any.whl.metadata (15 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collect

In [3]:
import os
from langchain_community.document_loaders import TextLoader

# 1. Create a dummy policy file
policy_text = """
TechCorp Employee Leave Policy 2024

1. Casual Leave (CL):
All employees are eligible for 10 days of Casual Leave per calendar year.
CL must be applied for at least 2 days in advance.
Unused CL lapses at the end of the year and cannot be carried forward.

2. Sick Leave (SL):
Employees are entitled to 7 days of Sick Leave per year.
For SL exceeding 2 consecutive days, a medical certificate from a registered practitioner is mandatory.
Unused SL can be accumulated up to 45 days.

3. Privilege Leave (PL):
Employees earn 1.5 days of PL for every month of service.
PL eligibility starts after the completion of the probation period (6 months).
PL can be encashed at the time of separation.

4. Remote Work:
Employees are allowed 2 days of remote work per week with manager approval.
"""

with open("leave_policy.txt", "w") as f:
    f.write(policy_text)

# 2. Load the data
loader = TextLoader("leave_policy.txt")
documents = loader.load()

print(f"Loaded {len(documents)} document(s).")



Loaded 1 document(s).


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Define the splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,      # Small chunk size for this specific short example
    chunk_overlap=20,    # Small overlap
    separators=["\n\n", "\n", " ", ""]
)

# Split the documents
chunks = text_splitter.split_documents(documents)

print(f"Split into {len(chunks)} chunks.")
print(f"Sample Chunk: {chunks[1].page_content}")

Split into 8 chunks.
Sample Chunk: 1. Casual Leave (CL):
All employees are eligible for 10 days of Casual Leave per calendar year. 
CL must be applied for at least 2 days in advance.


In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# 1. Initialize Embedding Model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Create Vector Store (Chroma)
# This will embed the chunks and store them in memory
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="policy_collection"
)

print("Vector Database created successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector Database created successfully.


In [9]:
query1 = "What happens to my unused Casual Leave at the end of the year?"
response, docs = rag_pipeline(query1)
print(response)

**Context Retrieved for Generation:**
1. Casual Leave (CL):
All employees are eligible for 10 days of Casual Leave per calendar year. 
CL must be applied for at least 2 days in advance.
Unused CL lapses at the end of the year and cannot be carried forward.

(This context is passed to the LLM to generate the final natural language answer.)


In [7]:
# Function to simulate the LLM Generation step
# In a real scenario, you would use: llm = ChatOpenAI()
def generate_response(query, retrieved_docs):
    context = "\n".join([doc.page_content for doc in retrieved_docs])

    # Simple template for demonstration
    prompt = f"""
    You are a helpful HR Assistant. Use the context below to answer the user query.

    Context:
    {context}

    User Query: {query}

    Answer:
    """

    # This acts as the "LLM" for the assignment if no API key is available
    # It returns the context that would be sent to the LLM
    return f"**Context Retrieved for Generation:**\n{context}\n\n(This context is passed to the LLM to generate the final natural language answer.)"

In [10]:
# The Retrieval Function
def rag_pipeline(query):
    # 1. Retrieve top 2 most relevant chunks
    retriever = vector_db.as_retriever(search_kwargs={"k": 2})
    retrieved_docs = retriever.invoke(query)

    # 2. Generate (Simulated)
    response = generate_response(query, retrieved_docs)
    return response, retrieved_docs

In [11]:
query1 = "What happens to my unused Casual Leave at the end of the year?"
response, docs = rag_pipeline(query1)
print(response)

**Context Retrieved for Generation:**
1. Casual Leave (CL):
All employees are eligible for 10 days of Casual Leave per calendar year. 
CL must be applied for at least 2 days in advance.
Unused CL lapses at the end of the year and cannot be carried forward.

(This context is passed to the LLM to generate the final natural language answer.)


In [12]:
query2 = "When do I need to submit a medical certificate?"
response, docs = rag_pipeline(query2)
print(response)

**Context Retrieved for Generation:**
2. Sick Leave (SL):
Employees are entitled to 7 days of Sick Leave per year. 
For SL exceeding 2 consecutive days, a medical certificate from a registered practitioner is mandatory.
1. Casual Leave (CL):
All employees are eligible for 10 days of Casual Leave per calendar year. 
CL must be applied for at least 2 days in advance.

(This context is passed to the LLM to generate the final natural language answer.)


In [13]:
query3 = "How many days can I work from home?"
response, docs = rag_pipeline(query3)
print(response)

**Context Retrieved for Generation:**
4. Remote Work:
Employees are allowed 2 days of remote work per week with manager approval.
1. Casual Leave (CL):
All employees are eligible for 10 days of Casual Leave per calendar year. 
CL must be applied for at least 2 days in advance.

(This context is passed to the LLM to generate the final natural language answer.)
