# NEW APPROACH\

In [1]:

# Importing required libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
import os

# Specify the folder containing PDF files
pdf_folder = "Data/"  # Replace with the path to your folder

# Initialize a list to hold all chunks
all_chunks = []

# Define the chunk size and overlap
chunk_size = 500  # Number of characters in each chunk
chunk_overlap = 50  # Overlapping characters between chunks

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

# Loop through all PDFs in the folder
for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, filename)
        print(f"Processing file: {filename}")

        # Read the PDF
        pdf_reader = PdfReader(pdf_path)

        # Extract text from all pages
        pdf_text = ""
        for page in pdf_reader.pages:
            pdf_text += page.extract_text()

        # Split the text into chunks
        chunks = text_splitter.split_text(pdf_text)
        print(f"Extracted {len(chunks)} chunks from {filename}")

        # Append chunks to the list
        all_chunks.extend(chunks)

print(f"Total chunks extracted: {len(all_chunks)}")


Processing file: Attendance and Punctuality_final.pdf
Extracted 16 chunks from Attendance and Punctuality_final.pdf
Processing file: Compensatory Off Policy_final.pdf
Extracted 6 chunks from Compensatory Off Policy_final.pdf
Processing file: Confirmation Policy-new.pdf
Extracted 6 chunks from Confirmation Policy-new.pdf
Processing file: EnFuse - Code of Conduct_July24 CS (1)_new.pdf
Extracted 49 chunks from EnFuse - Code of Conduct_July24 CS (1)_new.pdf
Processing file: Leave Policy_Updated_new.pdf
Extracted 18 chunks from Leave Policy_Updated_new.pdf
Total chunks extracted: 95


In [2]:
print(all_chunks)



In [8]:


# Importing required libraries
from sentence_transformers import SentenceTransformer
import faiss
import pickle

# Load the pre-trained embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each chunk
print("Generating embeddings for chunks...")
chunk_embeddings = embedding_model.encode(all_chunks, show_progress_bar=True)

# Convert embeddings to a format compatible with FAISS
chunk_embeddings = chunk_embeddings.astype("float32")

# Create a FAISS index and add the embeddings
print("Storing embeddings in FAISS index...")
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])  # L2 distance
index.add(chunk_embeddings)

# Save the FAISS index and metadata for later use
faiss_index_path = "embeddings/hr_policy_faiss.index"
metadata_path = "embeddings/hr_policy_faiss_metadata.pkl"

# Save the FAISS index
faiss.write_index(index, faiss_index_path)

# Save the metadata (chunks) for mapping back to the original content
with open(metadata_path, 'wb') as metadata_file:
    pickle.dump(all_chunks, metadata_file)

print(f"FAISS index saved at {faiss_index_path}")
print(f"Metadata saved at {metadata_path}")


Generating embeddings for chunks...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Storing embeddings in FAISS index...
FAISS index saved at embeddings/hr_policy_faiss.index
Metadata saved at embeddings/hr_policy_faiss_metadata.pkl


In [51]:
# Import required libraries
import faiss
from sentence_transformers import SentenceTransformer
import pickle

# Paths to FAISS index and metadata
faiss_index_path = "embeddings/hr_policy_faiss.index"
metadata_path = "embeddings/hr_policy_faiss_metadata.pkl"

# Load the FAISS index
print("Loading FAISS index...")
index = faiss.read_index(faiss_index_path)

# Load metadata (original chunks)
print("Loading metadata...")
with open(metadata_path, 'rb') as metadata_file:
    chunks = pickle.load(metadata_file)

# Load the same embedding model used for creating the index
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Define the retrieval function
def retrieve_relevant_chunks(query, top_k=5):
    # Generate embedding for the query
    query_embedding = embedding_model.encode([query]).astype("float32")

    # Search for top_k most similar chunks
    distances, indices = index.search(query_embedding, top_k)

    # Map indices to chunks
    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx != -1:  # Valid index
            results.append((chunks[idx], dist))

    return results

# Example query
query = "What is the probation period for new hires?"
top_k = 5  # Number of relevant chunks to retrieve

# Retrieve results
retrieved_chunks = retrieve_relevant_chunks(query, top_k=top_k)

# Display results
print(f"\nQuery: {query}")
print("\nTop Relevant Chunks:")
for i, (chunk, distance) in enumerate(retrieved_chunks, 1):
    print(f"\nChunk {i}:")
    print(f"Content: {chunk}")
    print(f"Distance: {distance:.4f}")


Loading FAISS index...
Loading metadata...

Query: What is the probation period for new hires?

Top Relevant Chunks:

Chunk 1:
Content: 2.5.2.  New Hire Probation Period   
An employee within his/her probation period who has two occurrences will receive a Written 
employment may be terminated; should the probation period be extended, this ru le still 
applies.   
2.5.3.  Holidays   
If an employee calls out of work the day before, the day of, or the day after a holiday, the
Distance: 0.8029

Chunk 2:
Content: 1.4.  Process  
The standard probation duration lasts three months from the date of joining.  
The confirmation process initiates ten days before the actual confirmation due date.  
▪ The Manager should maintain employee assessment / feedback  
▪ In case the employee clears the probation successfully, the Manager can share the 
feedback with HR as a response to the probation alert email  
▪ HR then confirms the employee in the system and shares the confirmation letter
Distance: 0.

In [52]:
import os

# Import required libraries
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

# Use a pre-trained T5 model from Hugging Face for question answering
# You can replace 't5-base' with any other suitable model
hf_model = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",  # T5 model for text-to-text tasks, you can use any other model
    task="text2text-generation"
)

# Set up the prompt template for your HR policy Q&A
prompt_template = """
You are an AI assistant trained to answer questions about HR policies.
Given the following context and user query, provide a short and accurate answer.

Context:
{context}

Query:
{query}

Answer:
"""

# Initialize the prompt template
prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=prompt_template
)

# Create a chain using the Hugging Face model
llm_chain = LLMChain(llm=hf_model, prompt=prompt)

# Example usage
query = "What is the probation period for new hires?"

# Run the chain with context and query
# response = llm_chain.run(context=context, query=query)
# print("\nResponse from the chain:")
# print(response)


In [53]:
response = llm_chain.run(context=retrieved_chunks, query=query)

In [54]:
print(response)


You are an AI assistant trained to answer questions about HR policies.
Given the following context and user query, provide a short and accurate answer.

Context:

Query:
What is the probation period for new hires?

Answer:
The standard probation duration lasts three months from the date of joining.


In [15]:
# Read index files

import pickle

file_path = r'D:\RAG\embeddings\hr_policy_faiss_metadata.pkl'

with open(file_path, 'rb') as file:
    metadata = pickle.load(file)

print(metadata)    



In [16]:
metadata

['1. Policy Brief   \nTimely and regular attendance is an expectation of performance for all EnFuse employees. \nTo ensure adequate staffing, positive employee morale, and to meet expected productivity \nstandards throughout the organization, employees will be held accountable for adhering to',
 'their work schedule. If an employee is unable to meet this expectation, he/she must obtain \napproval from their Manager in advance of any requested schedule changes. This approval \nincludes requests to use appropriate accruals and late arrivals t o or early departures from',
 'work. Departments had the discretion to evaluate extraordinary circumstances of a tardy, \nabsence, or failure to check -in or check -out and determine whether or not to count the \nincident as an occurrence. The HR team is available t o advise Managers regarding the',
 'evaluation of extenuating circumstances.       \n 2. Procedures   \n2.1.   Absent   \nAn employee is deemed absent when he/she is unavailable for work

In [67]:
import faiss

index_file_path = 'D:\RAG\embeddings\hr_policy_faiss.index'

try :
    index = faiss.read_index(index_file_path)
    print(index)
except Exception as e:
    print("Error")    

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001C883F999B0> >


  index_file_path = 'D:\RAG\embeddings\hr_policy_faiss.index'


# MODEL EVALUATION USING RAGAs

In [61]:
from src.pdf_extraction import extract_text_from_pdf, extract_text_from_pdfs

data = extract_text_from_pdfs(r'./Data')

Processing file: Attendance and Punctuality_final.pdf
Processing file: Compensatory Off Policy_final.pdf
Processing file: Confirmation Policy-new.pdf
Processing file: EnFuse - Code of Conduct_July24 CS (1)_new.pdf
Processing file: Leave Policy_Updated_new.pdf


In [99]:

source_text = data[4]
source_text

" \n Leave Policy  \n \nPolicy Document  \n1. Introduction  \n \n1.1.  Objectives  \nThis guideline outlines the leaves available for Full -time and Contract employees of EnFuse \nSolutions Ltd. It also outlines employee eligibility, leave application process, carry over and \ntreatment of unused leaves.  \n1.2.  Scope and Applicability  \nThis guideline applies to all employees and probationers of EnFuse Solutions. The annual \nleave calendar runs from April 1st and ends on March 31st.   \n  \n  \n    \n  2. Leaves for Full -time Regular Employees  \n \nAt the start of every month, 2 paid leaves will be accrued to the employee's leave account \nfor joiners between 1st and 15th and 1 paid leave will be accrued to the employee's leave \nbalance account for joiners between 16th to 30th/31st. If the leave is not availed in a month, \nthe same shall be carried forward to the next month.  \n▪ The Employee is entitled to two days off in a week (5 working days)  \n▪ Employee is entitled to 24

In [96]:
from src.chunking import split_text_into_chunks

new = split_text_into_chunks(source_text)
new

["Compensatory Off Policy  \n \n1.  Compensatory Off Policy  \n \n1.1.  Introduction  \nThe Compensatory Off Policy document aims to define EnFuse's approach to the concept of  \nCompensatory policy. This document will act as a guideline, and it is expected that all the \nEnFusians abide by the same.",
 '1.2.  Definition of Compensatory Off  \nAlso known as Comp Off, Compensatory Off is an optional way of providing leave to \nemployees who work on a holiday or a week off. The Compensatory Off policy provides \nemployees with leaves to be compensated by a holiday or a week off work. Any such',
 'approved  Holiday / Week off working would be marked as a compensatory off in the system. \nIt can be availed during the eligibility period (provided all the eligibility / validity criteria are \nqualifying).  \n1.3.  Eligibility for Applying for Compensatory Off',
 'The Employees can only apply for Compensatory Off if they fulfill the following two criteria:  \n▪ The Employee should have worked

In [48]:
from huggingface_hub import login
login(token = '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')

In [None]:
import random
from transformers import pipeline, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("wiselinjayajos/t5-end2end-questions-generation")
# Initialize the question-generation pipeline
question_generator = pipeline("text2text-generation", model="wiselinjayajos/t5-end2end-questions-generation")

Device set to use cpu


In [89]:
# Function to generate synthetic data from a list of text fragments
def generate_synthetic_data(source_text_list, max_questions_per_text=10):
    synthetic_data = []

    for source_text in source_text_list:
        # Split the text into logical chunks (e.g., paragraphs or sections)
        source_data = source_text.split('\n\n')

        for i, item in enumerate(source_data):
            if max_questions_per_text is not None and i >= max_questions_per_text:
                break

            # Generate a question based on the paragraph/section
            generated = question_generator(f"generate question: {item}", max_length=64, num_return_sequences=1)
            question = generated[0]['generated_text']

            # Use the paragraph/section as context and extract a ground truth answer
            context = item
            answer = "Not specified"  # Replace with a more sophisticated answer extraction logic if needed

            synthetic_data.append({
                "question": question,
                "context": context,
                "ground_truth": answer
            })

    return synthetic_data

# Generate synthetic dataset with a maximum of 10 questions per text fragment
synthetic_dataset = generate_synthetic_data(new, max_questions_per_text=10)

# Display the generated questions and context
for i, item in enumerate(synthetic_dataset, 1):
    print(f"Sample {i}:")
    print(f"  Question: {item['question']}")
    print(f"  Context: {item['context']}\n")
    print(f"  Ground Truth: {item['ground_truth']}\n")


Sample 1:
  Question: What is the name of the document that defines EnFuse's approach to the concept of Compensatory policy? What is the name of the document that defines EnFuse's approach to the concept of Compensatory policy? What is the name of the document that defines EnF
  Context: Compensatory Off Policy  
 
1.  Compensatory Off Policy  
 
1.1.  Introduction  
The Compensatory Off Policy document aims to define EnFuse's approach to the concept of  
Compensatory policy. This document will act as a guideline, and it is expected that all the 
EnFusians abide by the same.

  Ground Truth: Not specified

Sample 2:
  Question: What is an optional way of providing leave to employees who work on a holiday or a week off? What is an optional way of providing leave to employees who work on a holiday or a week off? What is an optional way of providing leave to employees who work on a holiday or
  Context: 1.2.  Definition of Compensatory Off  
Also known as Comp Off, Compensatory Off is an 

# RAGAS EVALUATION

i have questions in excel file and now i want to store the context and answers in that for evaluations


In [1]:
#Load FAISS index and metadata
from src.embedding import load_faiss_index, load_metadata
from src.retrieval import retrieve_relevant_chunks
from src.qa import create_llm_chain, get_answer

faiss_index_path = r'D:\RAG\embeddings\hr_policy_faiss.index'
metadata_path = r'D:\RAG\embeddings\hr_policy_faiss_metadata.pkl'
index = load_faiss_index(faiss_index_path)
metadata = load_metadata(metadata_path)
""
query = " Under what conditions can an employee apply for Compensatory Off? "
#Retrieve relevant chunks based on the query
retrieved_chunks = retrieve_relevant_chunks(query, index, metadata, top_k=2)
print(retrieved_chunks)

print("==========================================")

# LLM answer generation
# Step 7: Get the answer using the retrieved context
llm_chain = create_llm_chain()
context = "\n".join([chunk for chunk, _ in retrieved_chunks])
answer = get_answer(llm_chain, context, query)
output = answer['text']
print(output)

[('The Employees can only apply for Compensatory Off if they fulfill the following two criteria:  \n▪ The Employee should have worked for One Full Day - On account of a project \nrequirement pre -approved by the respective Manager  \n▪ The Employee should be a Full -time resource', 0.3624244), ('approved  Holiday / Week off working would be marked as a compensatory off in the system. \nIt can be availed during the eligibility period (provided all the eligibility / validity criteria are \nqualifying).  \n1.3.  Eligibility for Applying for Compensatory Off', 0.59751034)]


  return LLMChain(llm=hf_model, prompt=prompt)


 An employee can apply for Compensatory Off if they have worked for one full day on account of a project requirement pre-approved by their respective manager and are a full-time resource. The holiday/week off working would be marked as a compensatory off in the system and can be availed during the eligibility period.


# Code to get testing excel sheet

In [2]:
import pandas as pd
from src.embedding import load_faiss_index, load_metadata
from src.retrieval import retrieve_relevant_chunks
from src.qa import create_llm_chain, get_answer

# Load FAISS index and metadata
faiss_index_path = r'D:\RAG\embeddings\hr_policy_faiss.index'
metadata_path = r'D:\RAG\embeddings\hr_policy_faiss_metadata.pkl'

# Load the FAISS index and metadata
faiss_index = load_faiss_index(faiss_index_path)  # Rename the FAISS index variable
metadata = load_metadata(metadata_path)

# Load the Excel file containing the queries
excel_path = r'C:\Users\kartik.rathi_enfuse-\Desktop\test_data.xlsx'
df = pd.read_excel(excel_path)

# Initialize the LLM chain
llm_chain = create_llm_chain()

# Process each query in the 'Ques' column
for idx, row in df.iterrows():  # Use a different name for the DataFrame index (e.g., idx)
    query = row['Ques']

    # Retrieve relevant chunks based on the query using the FAISS index
    retrieved_chunks = retrieve_relevant_chunks(query, faiss_index, metadata, top_k=2)
    context = "\n".join([chunk for chunk, _ in retrieved_chunks])

    # Get the answer using the retrieved context
    answer = get_answer(llm_chain, context, query)
    output = answer['text']

    # Store the context and answer in the DataFrame
    df.at[idx, 'Context'] = context
    df.at[idx, 'Answer'] = output

# Save the updated DataFrame to the Excel file
df.to_excel(r'D:\RAG\queries_with_answers.xlsx', index=False)

print("Processing completed. Results saved to 'queries_with_answers.xlsx'.")


ImportError: cannot import name 'get_answer' from 'src.qa' (d:\RAG\src\qa.py)

# Evaluation

In [1]:
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference

context_precision = NonLLMContextPrecisionWithReference()

sample = SingleTurnSample(
    retrieved_contexts=["The Eiffel Tower is located in Paris."], 
    reference_contexts=["Paris is the capital of France.", "The Eiffel Tower is one of the most famous landmarks in Paris."]
)

await context_precision.single_turn_ascore(sample)

0.9999999999

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM

class Mistral7B(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self):
        return "Mistral 7B"





In [None]:
from tempfile import TemporaryDirectory
from transformers import AutoModelForCausalLM, AutoTokenizer

with TemporaryDirectory() as tmpdirname:

    model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1",torch_dtype="float32")
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# mistral_7b = Mistral7B(model=model, tokenizer=tokenizer)
# print(mistral_7b.generate("Write me a joke"))

# Query Rewriter

In [47]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

In [48]:
generation_config = {
        'temperature': 0,
        'max_output_tokens': 8000,
        'top_p': 0.2
    }
model = genai.GenerativeModel(model_name='gemini-pro', generation_config=generation_config)


In [49]:
query = "should any female employee have to submit any document to take the maternity leave "
prompt_jd = f"""
    "Rewrite the given user query to make it precise and specific for retrieving accurate information from a 
    retrieval-augmented generation (RAG) system. Focus on HR policies and include all relevant details to ensure clarity.

    Args:
        user_query (str): Original query {query} from the user.

    Returns:
        str: Rewritten query for accurate retrieval.
    """

    # Step 3: Generate response using the model
response_jd = model.generate_content([prompt_jd])
response_text = response_jd.parts

In [50]:
print(response_text[0].text)

Retrieve HR policies related to maternity leave for female employees, including any specific documentation requirements for taking leave.


# QA Chnage

In [None]:
import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import StructuredOutputParser
from langchain_core.output_parsers import StrOutputParser


def chain_output(context, re_query, model_repo_id="mistralai/Mistral-7B-Instruct-v0.3"):

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

    hf_model = HuggingFaceEndpoint(
        repo_id=model_repo_id,
        task="text2text-generation"
        )


    prompt_template = """
    You are an AI assistant trained to answer questions about HR policies in structured format.
    Given the following context and user query, provide a concise and accurate answer.Dont include headlines.

    Context:
    {context}

    Query:
    {query}

    Answer:
    """

    prompt = PromptTemplate(
        input_variables=["context", "re_query"],
        template=prompt_template
    )

    chain = prompt | hf_model | StrOutputParser()

    return chain.invoke({'context': context, 'query': re_query})

In [55]:
re_query = "What Should we do to take more than 3 days leave"
context = '''start day   
    ▪ 5 or more leaves: A leave request must be submitted at least 10 days before the leave 
    start day
    ▪ For 1 day's leave: A leave request must be raised at least 24 hours before the leave start 
    time   
    ▪ For 2 -3 days' leave: A leave request must be submitted at least 3 days before the leave 
    start day   
    ▪ For 4 -5 day's leave: A leave request must be submitted at least 6 days before the leave
    the same shall be carried forward to the next month.   
    ▪ The Employee is entitled to two days off in a week (5 working days)   
    ▪ Employee is entitled to 24 days of paid personal leaves in a year. Employees are'''

print(chain_output(re_query=re_query, context=context))

1. To take more than 3 days leave, a leave request must be submitted at least 3 days before the leave start date.
    2. If the leave is for 4 - 5 days, the request must be submitted at least 6 days before the leave start date.
    3. The leave shall be carried forward to the next month if not availed.
    4. Ensure you have sufficient personal leaves (24 days in a year) for the requested leave duration.


# ReWrite Query


In [7]:
from dotenv import load_dotenv
import google.generativeai as genai
import os
load_dotenv()

    # Configure the Generative AI API
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

    # Define the generation configuration
generation_config = {
        'temperature': 0,
        'max_output_tokens': 8000,
        'top_p': 0.2
    }

    # Initialize the model
model = genai.GenerativeModel(model_name='gemini-pro', generation_config=generation_config)

## Chunk

In [18]:
def extract_text_from_pdfs(pdf_folder):
    """Extract text from all PDFs in the folder."""
    all_text = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            print(f"Processing file: {filename}")
            pdf_text = extract_text_from_pdf(pdf_path)
            all_text.append(pdf_text)
    return all_text

In [22]:
import os
path = r"C:\Users\kartik.rathi_enfuse-\Documents\New_Project\Policies_chatbot\Data"
texts = extract_text_from_pdfs(path)

Processing file: Acceptable Use Policy (1)_new.pdf
Processing file: Attendance and Punctuality_new1.pdf
Processing file: Compensatory Off Policy_new.pdf
Processing file: Confirmation Policy_new.pdf
Processing file: Diversity and Inclusion Policy_new.pdf
Processing file: Dress Code Policy_new1.pdf
Processing file: EnFuse - Code of Conduct_July24 CS (1)_new.pdf
Processing file: EnFuse Social Media and Digital Communications Policy-V3_new.pdf
Processing file: Equal Employment Opportunity Policy_new.pdf
Processing file: Exit Process Flow (1)_new.pdf
Processing file: Internal Job Posting_Internal Transfer Policy (1)_new.pdf
Processing file: Leave Policy_Updated (1)_new1.pdf
Processing file: Non-Discrimination and Anti-Harassment Policy_V3_new.pdf
Processing file: Performance Appraisal and Development Policy_new.pdf
Processing file: Prevention of Sexual Harassment (POSH) of Women at the Workplace Policy_V3_new.pdf


In [16]:
import re

def chunk_document_by_sections(document_text):
    """
    Split the document into sections based on numbering (e.g., 1., 2., etc.).
    Returns a list of chunks, where each chunk corresponds to a section.
    """
    # Regular expression to match section headers (e.g., "1.", "2.", etc.)
    section_pattern = re.compile(r'\n\s*\d+\.\s+')

    # Split the document into sections using the section pattern
    sections = re.split(section_pattern, document_text)

    # Remove any leading/trailing whitespace from each section
    sections = [section.strip() for section in sections if section.strip()]

    return sections

# Chunk the document into sections
chunks = chunk_document_by_sections(document)

chunks

['Acceptable Use  \n \nPolicy Document',
 'Overview  \n \nThe Company provides access to the vast information resources of the Internet to help its \nemployees execute business functions efficiently, and be a well -informed business citizen. \nThe facilities to provide that access represent a considerable commitment o f company \nresources for telecommunications, networking, software and storage. This acceptable use \npolicy sets the Company’s expectations for the use of all such resources.   \nWhile we’ve set forth explicit requirements for Company assets and Internet usage below, \nwe’d like to start by describing our Internet usage philosophy. First and foremost, the Internet \nis a business tool for the Company provided to you at significant cost. That means we expect \nyou to use Company assets and Internet resources primarily for business -related purposes,  \ni.e., to communicate with clients and suppliers, to research relevant topics and obtain useful \nbusiness information. We

In [23]:
import re

def chunk_documents_by_sections(documents_text):
    """
    Split each document in the list into sections based on numbering (e.g., 1., 2., etc.).
    Returns a list of lists, where each inner list corresponds to the sections of a document.
    """
    # Regular expression to match section headers (e.g., "1.", "2.", etc.)
    section_pattern = re.compile(r'\n\s*\d+\.\s+')

    # Initialize an empty list to hold the chunks for each document
    all_chunks = []

    # Iterate over the list of document texts
    for document_text in documents_text:
        # Split the document into sections using the section pattern
        sections = re.split(section_pattern, document_text)

        # Remove any leading/trailing whitespace from each section
        sections = [section.strip() for section in sections if section.strip()]

        # Append the chunks (sections) for the current document to the all_chunks list
        all_chunks.append(sections)

    return all_chunks


# Chunk the documents into sections
all_chunks = chunk_documents_by_sections(texts)

# Print the chunks for each document
for i, chunks in enumerate(all_chunks, start=1):
    print(f"Document {i}:")
    for j, chunk in enumerate(chunks, start=1):
        print(f"  Chunk {j}:\n{chunk}\n{'-'*40}")


Document 1:
  Chunk 1:
Acceptable Use  
 
Policy Document
----------------------------------------
  Chunk 2:
Overview  
 
The Company provides access to the vast information resources of the Internet to help its 
employees execute business functions efficiently, and be a well -informed business citizen. 
The facilities to provide that access represent a considerable commitment o f company 
resources for telecommunications, networking, software and storage. This acceptable use 
policy sets the Company’s expectations for the use of all such resources.   
While we’ve set forth explicit requirements for Company assets and Internet usage below, 
we’d like to start by describing our Internet usage philosophy. First and foremost, the Internet 
is a business tool for the Company provided to you at significant cost. That means we expect 
you to use Company assets and Internet resources primarily for business -related purposes,  
i.e., to communicate with clients and suppliers, to research rele

# Router 

In [10]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

In [21]:
def rewrite_query_for_rag(query):
    # Load environment variables
    load_dotenv()

    # Configure the Generative AI API
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

    # Define the generation configuration
    generation_config = {
        'temperature': 0,
        'max_output_tokens': 8000,
        'top_p': 0.2
    }

    # Initialize the model
    model = genai.GenerativeModel(model_name='gemini-pro', generation_config=generation_config)

    # Define the prompt
    prompt = f"""
    Behave as a HR chatbot .If it is greeting ,make a reply to the {query}"
    """

    # Generate response
    response_jd = model.generate_content([prompt])

    # Extract and return the text
    return response_jd.parts[0].text

In [22]:
rewrite_query_for_rag("How are you")

"**If the input is a greeting:**\n\n**User:** Hello, how are you?\n\n**Chatbot:** Hello there! I'm doing well, thank you for asking. How can I assist you today?"

# Testing chunking 

In [13]:
from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF."""
    pdf_reader = PdfReader(pdf_path)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text


In [14]:
text = extract_text_from_pdf(r"D:\RAG\Data\EnFuse - Code of Conduct_July24 CS (1)_new1.pdf")
type(text)

str

In [15]:
def split_text_into_chunks(text, chunk_size=650, chunk_overlap=70):
    """Split text into smaller chunks without breaking sentences."""
    # Regular expression to match sentence boundaries (., ?, !, etc.)
    sentence_endings = re.compile(r'([.!?])\s+')

    # Split the text into sentences based on sentence-ending punctuation
    sentences = sentence_endings.split(text)

    # Reconstruct the sentences correctly
    sentences = [sentences[i] + sentences[i+1] for i in range(0, len(sentences)-1, 2)]

    # Add the last sentence if the text ended without punctuation at the end
    if len(sentences) * 2 != len(sentence_endings.findall(text)):
        sentences.append(sentences[-1] + sentences[-2])

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # Check if adding this sentence will exceed chunk_size
        if len(current_chunk) + len(sentence) + 1 > chunk_size:
            # If it exceeds, save the current chunk and start a new one
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            # Otherwise, append the sentence to the current chunk
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence

    # Add any remaining content as the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

In [16]:
import re
final = split_text_into_chunks(text, chunk_size=650, chunk_overlap=70)

In [17]:
for i in final:
    print(i)
    print("===============================")

Code of Conduct     
    
Policy Document     
    
1. Message from the Co -Founder & Director     
    
Our commitment to ethical behavior and the solid ethical foundations are one of the most 
essential components of EnFuse Solutions Ltd. operation. We are committed to doing 
business the right way, based on a culture of ethics and compliance. In the long term, we can successfully face the challenges of a competitive market 
environment by accepting the imperatives of moral responsibility, both as individuals and as a 
company.
In performing the job duties, the employees should always act lawfully , ethically 
and in the best interests of the EnFuse Solutions Ltd. Thank you for upholding our values and helping us do things right. It does not only mean that 
we provide well made, fairly priced and of exceptional quality products and services, but it 
also means that ethics and integrity is always born in mind. We sourc e material only from 
suppliers who have impeccable human rights a

In [18]:
import re

def split_text_into_chunks(text, chunk_size=1200, chunk_overlap=120):
    """Split text into smaller chunks with overlap while preserving sentence boundaries."""

    # Regular expression to match sentence boundaries (., ?, !, etc.)
    sentence_endings = re.compile(r'([.!?])\s+')

    # Split the text into sentences while keeping the punctuation marks
    sentences = sentence_endings.split(text)
    sentences = [sentences[i] + sentences[i+1] for i in range(0, len(sentences)-1, 2)]

    # Ensure the last sentence is added if text ends without punctuation
    if len(sentences) * 2 != len(sentence_endings.findall(text)):
        sentences.append(sentences[-1] + sentences[-2])

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)

        # If adding the sentence exceeds chunk size, finalize the current chunk
        if current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk).strip())

            # Implement overlap: Keep last 'chunk_overlap' characters for the next chunk
            overlap_text = current_chunk[-1][-chunk_overlap:] if current_chunk else ""
            current_chunk = [overlap_text] if overlap_text else []
            current_length = len(overlap_text)

        # Add the new sentence
        current_chunk.append(sentence)
        current_length += sentence_length

    # Add the last chunk if it has content
    if current_chunk:
        chunks.append(" ".join(current_chunk).strip())

    return chunks


In [19]:
final = split_text_into_chunks(text)
for i in final:
    print(i)
    print("===============================")

Code of Conduct     
    
Policy Document     
    
1. Message from the Co -Founder & Director     
    
Our commitment to ethical behavior and the solid ethical foundations are one of the most 
essential components of EnFuse Solutions Ltd. operation. We are committed to doing 
business the right way, based on a culture of ethics and compliance. In the long term, we can successfully face the challenges of a competitive market 
environment by accepting the imperatives of moral responsibility, both as individuals and as a 
company. In performing the job duties, the employees should always act lawfully , ethically 
and in the best interests of the EnFuse Solutions Ltd. Thank you for upholding our values and helping us do things right. It does not only mean that 
we provide well made, fairly priced and of exceptional quality products and services, but it 
also means that ethics and integrity is always born in mind. We sourc e material only from 
suppliers who have impeccable human rights a

In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Custom separators to prioritize sentence boundaries
separators = [". ", "? ", "! ", "\n", ", ", " ", ""]

# Define the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,  # Adjust based on your needs
    chunk_overlap=120,  # To ensure continuity between chunks
    length_function=len,
    separators=separators
)

# Perform text splitting
chunks = text_splitter.split_text(text)
print(chunks)
# Print the resulting chunks
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i}:\n{chunk}\n")


['Code of Conduct     \n    \nPolicy Document     \n    \n1.    Message from the Co -Founder & Director     \n    \nOur commitment to ethical behavior and the solid ethical foundations are one of the most \nessential components of EnFuse Solutions Ltd. operation. We are committed to doing \nbusiness the right way, based on a culture of ethics and compliance.     \nIn the long term, we can successfully face the challenges of a competitive market \nenvironment by accepting the imperatives of moral responsibility, both as individuals and as a \ncompany. In performing the job duties, the employees should always act lawfully , ethically \nand in the best interests of the EnFuse Solutions Ltd.      \nThank you for upholding our values and helping us do things right. It does not only mean that \nwe provide well made, fairly priced and of exceptional quality products and services, but it \nalso means that ethics and integrity is always born in mind. We sourc e material only from \nsuppliers wh