In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

## Loading

In [2]:
loader = PyPDFDirectoryLoader("pdfs")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = loader.load_and_split(text_splitter)# RecursiveCharacterTextSplitter is used by default

In [3]:
len(documents)

61

# Indexing

In [4]:
import os
from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings
from langchain_elasticsearch import ElasticsearchStore
from langchain_elasticsearch.vectorstores import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings
from langchain_elasticsearch.embeddings import ElasticsearchEmbeddings

In [5]:
os.environ['OPENAI_API_KEY'] = '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' 

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")
db = ElasticsearchStore.from_documents(
    documents=documents,
    embedding=embeddings_model,
    es_url="http://localhost:9201",
    index_name="test_index_v2"
)
db.client.indices.refresh(index="test_index_v2")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

In [6]:
# Retreiver
retriever = db.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5, "k":3}
)
docs = retriever.get_relevant_documents("Explain to me how can I refer someone?")
len(docs)

3

In [7]:
docs

[Document(page_content="To\nencourage\nemployee\nreferrals,\nSimpplr\noffers\na\nreferral\nbonus\nto\neligible\nemployees\nwho\nrefer\ncandidates\nresulting\nin\nsuccessful\nhires.\nEmployees,\nexcept\nthose\nin\nthe\nHR\ndepartment\nor\nin\na\nhiring\nrole,\nare\neligible\nfor\nthe\nreferral\nbonus.\n5.2\nReferral\nProcess\nEmployees\ncan\nrefer\ncandidates\nby\nsubmitting\nan\nemployee\nreferral\nform,\nproviding\npertinent\ninformation\nabout\nthe\ncandidate.\nThe\nform\nshould\nbe\nsubmitted\nbefore\nor\nat\nthe\nsame\ntime\nas\nthe\ncandidate's\napplication.\n5.3\nBonus\nPayout\nThe\nreferral\nbonus\nis\npaid\nout\nin\ntwo\nparts\n-\nhalf\nupon\nthe\ncandidate's\nsuccessful\nonboarding\nand\nthe\nremaining\nhalf\nafter\nthe\nreferred\nemployee\ncompletes\nsix\nmonths\nof\ncontinuous\nemployment.\nThe\nbonus\namount\nmay\nvary,\ndepending\non\nthe\nlevel\nand\ncriticality\nof\nthe\nposition.\n6.\nHiring\nof\nRelatives\nWhile\nSimpplr\nvalues\ndiversity\nand\nequal\nopportunity,\nwe

## Generation

In [11]:
from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain import hub


llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [12]:
output = rag_chain_with_source.invoke("Explain to me parental leave?")

In [13]:
output

{'context': [Document(page_content='Title:\nParental\nLeave\nPolicy\nIntroduction:\nAt\nSimpplr,\nwe\nbelieve\nin\nfostering\na\nfamily-friendly\nwork\nenvironment\nand\nsupporting\nour\nemployees\nin\nthe\nvarious\nstages\nof\ntheir\npersonal\nlives.\nThis\nParental\nLeave\nPolicy\noutlines\nthe\nprovisions\nand\nguidelines\nfor\nemployees\nwho\nare\nexpecting\nor\nadopting\na\nchild.\nOur\ngoal\nis\nto\nensure\nthat\nboth\nparents\nhave\nthe\nopportunity\nto\nbond\nwith\ntheir\nnewborn\nor\nnewly\nadopted\nchild,\nwhile\nalso\nmaintaining\njob\nsecurity\nand\npromoting\nwork-life\nbalance.\nPolicy\nStatement:\nSimpplr\nrecognizes\nthe\nimportance\nof\nparental\nbonding\nand\nencourages\nemployees\nto\ntake\nadvantage\nof\nparental\nleave\nfollowing\nthe\nbirth,\nadoption,\nor\nfoster\ncare\nplacement\nof\na\nchild.\nThis\npolicy\nprovides\neligible\nemployees\nwith\npaid\nand\nunpaid\nleave\noptions\nto\nsupport\nthem\nduring\nthis\nsignificant\nlife\nevent.\nThis\npolicy\napplies\nt

In [14]:
# Assuming 'output' contains your original data structured with Document objects
def beautify_output(output):
    response = "Response:\n" + output['answer'] + "\n\n"
    sources = "Sources:\n"
    
    added_sources = set()
    for i, doc in enumerate(output['context'], start=1):
        # Accessing attributes directly; adjust these as necessary for your Document class
        source_info = f"Source{i} ({doc.metadata['source']}, Page {doc.metadata['page']}): "
        page_content = doc.page_content.replace("\n", " ").strip()
        if source_info not in added_sources:
            sources += source_info + page_content + "\n"
            added_sources.add(source_info)
    
    return response + sources

# Make sure 'output' is defined as shown in your example before calling this
formatted_output = beautify_output(output)
print(formatted_output)


Response:
Parental leave is a policy that allows employees who are expecting or adopting a child to take paid or unpaid time off work to bond with their new child. At Simpplr, this policy applies to full-time and part-time employees who have been employed for at least six months continuously and wish to return to work after the leave period is over. It includes maternity leave for birthing mothers, paternity leave for fathers and non-birthing parents, and adoption/foster care leave for those who adopt or foster a child, with paid and unpaid options available.

Sources:
Source1 (pdfs/GPT-parental leave policy.pdf, Page 0): Title: Parental Leave Policy Introduction: At Simpplr, we believe in fostering a family-friendly work environment and supporting our employees in the various stages of their personal lives. This Parental Leave Policy outlines the provisions and guidelines for employees who are expecting or adopting a child. Our goal is to ensure that both parents have the opportunity 

# Evaluation

In [17]:
documents

[Document(page_content='Title:\nPersonal\nand\nPaid\nLeave\nPolicy\nIntroduction:\nAt\nSimpplr,\nwe\nprioritize\nthe\nwell-being\nand\nwork-life\nbalance\nof\nour\nemployees.\nThe\nPersonal\nand\nPaid\nLeave\nPolicy\naims\nto\ncreate\nan\ninclusive\nand\nsupportive\nwork\nenvironment\nthat\nallows\nemployees\nto\ntake\ntime\noff\nfor\npersonal\nreasons,\nsuch\nas\nillness,\nfamily\nemergencies,\nor\npersonal\nappointments,\nwhile\nensuring\nthey\nare\ncompensated\nduring\ntheir\nabsence.\nThis\npolicy\noutlines\nthe\nprovisions\nand\nguidelines\nfor\npersonal\nand\npaid\nleaves,\neligibility\nrequirements,\nthe\napplication\nprocess,\nand\nthe\nbenefits\nemployees\ncan\nexpect\nwhile\non\nleave.\nPolicy\nStatement:\nSimpplr\nacknowledges\nthe\nimportance\nof\npersonal\ntime\nand\nunderstands\nthat\nemployees\nmay\nneed\nto\ntake\ntime\noff\nfor\nvarious\npersonal\nreasons.\nThis\npolicy\nensures\nthat\neligible\nemployees\nhave\nthe\nopportunity\nto\nbalance\ntheir\npersonal\nand\nprof

In [16]:
for document in documents:
    document.metadata['filename'] = document.metadata['source']

In [18]:
# Data generation
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# generator with openai models
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-3.5-turbo")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# generate testset
testset = generator.generate_with_langchain_docs(documents, test_size=30, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

  from .autonotebook import tqdm as notebook_tqdm
Generating: 100%|██████████| 31/31 [03:28<00:00,  6.72s/it]


In [34]:
df = testset.to_pandas()
df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the importance of maintaining a regula...,[a.\nRemote\nemployees\nare\nrequired\nto\nmee...,Remote employees should maintain a regular wor...,simple,[{'source': 'pdfs/GPT- Remote Work Policy.pdf'...,True
1,What is the purpose of granting employee stock...,[Employee\nStock\nOptions\nPolicy\n1.\nIntrodu...,The purpose of granting employee stock options...,simple,"[{'source': 'pdfs/GPT- ESOPS policy.pdf', 'pag...",True
2,What is the purpose of setting maximum reimbur...,[To\nmaintain\ncost\ncontrol\nand\nprevent\nab...,To maintain cost control and prevent abuse,simple,[{'source': 'pdfs/GPT - Expense Reimbursement ...,True
3,What is the process for requesting and returni...,[Leave\nProcess\nand\nDocumentation:\n1.\nNoti...,Employees seeking parental leave must notify t...,simple,[{'source': 'pdfs/GPT-parental leave policy.pd...,True
4,How does Simpplr handle compensation for emplo...,[matters\nwhile\nbeing\ncompensated\nduring\nt...,Simpplr handles compensation for employees dur...,simple,"[{'source': 'pdfs/GPT - leave policy.pdf', 'pa...",True
5,What is the disclosure requirement for employe...,"[values\ndiversity\nand\nequal\nopportunity,\n...",Employees who have a family relationship with ...,simple,[{'source': 'pdfs/GPT- Recruitment and Onboard...,True
6,What is the role of the Human Resources depart...,[and\nthe\ncompany\nwhile\nmaintaining\nthe\nh...,Full-time remote work arrangements should be r...,simple,[{'source': 'pdfs/GPT- Remote Work Policy.pdf'...,True
7,What are the expectations for remote employees...,[their\nsupervisor\nand\nthe\nHuman\nResources...,Remote employees are required to meet the same...,simple,[{'source': 'pdfs/GPT- Remote Work Policy.pdf'...,True
8,How does the implementation of this policy pro...,[It\naims\nto\nprovide\na\nframework\nthat\npr...,The implementation of this policy promotes fai...,simple,[{'source': 'pdfs/GPT - grievance and discipli...,True
9,What is the vesting schedule for stock options...,[and\neligible\nemployees\nmust\nindicate\nthe...,The vesting schedule for stock options granted...,simple,"[{'source': 'pdfs/GPT- ESOPS policy.pdf', 'pag...",True


In [35]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [36]:
dataset

Dataset({
    features: ['question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', 'episode_done'],
    num_rows: 30
})

In [39]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate

result = evaluate(
    dataset,
    metrics=[
        context_precision,
        context_recall,
    ],
)

result

Evaluating: 100%|██████████| 60/60 [00:52<00:00,  1.15it/s]


{'context_precision': 0.9000, 'context_recall': 0.9352}