In [None]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Loading

In [None]:
# Read every PDF found under /pdfs and cut the text into overlapping chunks.
loader = PyPDFDirectoryLoader("/pdfs")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=200,
)
# The splitter configured above is passed explicitly, so the loader's
# default splitter is not used.
documents = loader.load_and_split(text_splitter=text_splitter)

In [None]:
# Number of chunk documents returned by `load_and_split`.
len(documents)

# Indexing

In [None]:
import os
from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings
from langchain_elasticsearch import ElasticsearchStore
from langchain_elasticsearch.vectorstores import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings
from langchain_elasticsearch.embeddings import ElasticsearchEmbeddings

In [None]:
import getpass

# Credentials must never live in the notebook. The key is taken from the
# environment and, if absent, requested interactively without echoing it.
# SECURITY: a literal API key used to be committed on this line; treat it as
# leaked and revoke it with the provider.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Single definition of the index name so creation and refresh cannot drift apart.
INDEX_NAME = "test_index_v2"

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed every chunk and store it in the Elasticsearch index.
# NOTE(review): re-running this cell presumably indexes the same chunks again,
# producing duplicates in the index; confirm and clear the index if needed.
db = ElasticsearchStore.from_documents(
    documents=documents,
    embedding=embeddings_model,
    es_url="http://localhost:9201",
    index_name=INDEX_NAME,
)
# Refresh so the freshly written documents are visible to the searches below.
db.client.indices.refresh(index=INDEX_NAME)

In [None]:
# Retriever: asks for up to 3 chunks (k=3) and applies a similarity score
# threshold of 0.5.
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5, "k": 3},
)
# Retrievers implement the Runnable interface; `invoke` replaces the
# deprecated `get_relevant_documents` call.
docs = retriever.invoke("Explain to me how can I refer someone?")
len(docs)

In [None]:
# Inspect the chunks returned for the sample query.
docs

# Generation

In [None]:
from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain import hub


# Chat model that writes the final answer.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
# Prompt template fetched by name through `langchain.hub`.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Concatenate the text of the given documents.

    Args:
        docs: Iterable of objects exposing a `page_content` string.

    Returns:
        The `page_content` values joined with one blank line between
        consecutive documents; an empty string when `docs` is empty.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

# Answering chain: swap the retrieved documents under "context" for their
# joined text, fill the prompt, call the model and keep the reply as a string.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=lambda inputs: format_docs(inputs["context"])
    )
    | prompt
    | llm
    | StrOutputParser()
)

# Outer chain: run the retriever and pass the question through side by side,
# then add the generated text under "answer" while keeping "context" and
# "question" in the result.
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [None]:
# Run the full chain for one sample question; the result carries the
# "context", "question" and "answer" keys assembled by the chain.
output = rag_chain_with_source.invoke("Explain to me parental leave?")

In [None]:
# Raw chain result, before any formatting.
output

In [None]:
# 'output' contains original data structured with Document objects
def beautify_output(output):
    """Render a RAG result as readable text: the answer, then its sources.

    Args:
        output: Mapping with an 'answer' string and a 'context' iterable of
            objects exposing `page_content` and a `metadata` mapping that
            contains 'source' and 'page'.

    Returns:
        A string made of a "Response:" section holding the answer, a blank
        line, and a "Sources:" section with one line per distinct chunk,
        formatted as "Source<n> (<source>, Page <page>): <text>".

    Raises:
        KeyError: If 'answer', 'context', 'source' or 'page' is missing.
    """
    source_lines = []
    seen_chunks = set()
    for doc in output['context']:
        source = doc.metadata['source']
        page = doc.metadata['page']
        # Collapse line breaks so each source fits on a single output line.
        page_content = doc.page_content.replace("\n", " ").strip()

        # The de-duplication key must not contain the running number:
        # a key that embeds the position is unique for every item, so
        # repeated chunks would never be filtered out.
        chunk_key = (source, page, page_content)
        if chunk_key in seen_chunks:
            continue
        seen_chunks.add(chunk_key)

        # Number only the entries actually shown, so the list has no gaps.
        label = f"Source{len(seen_chunks)} ({source}, Page {page}): "
        source_lines.append(label + page_content + "\n")

    response = "Response:\n" + output['answer'] + "\n\n"
    return response + "Sources:\n" + "".join(source_lines)
# Format the chain result from the sample question and print it.
formatted_output = beautify_output(output)
print(formatted_output)


# Evaluation

In [None]:
# NOTE(review): this displays every chunk in the cell output; a slice such as
# documents[:3] would keep the notebook readable.
documents

In [None]:
# Mirror each chunk's 'source' metadata under a 'filename' key as well.
for chunk in documents:
    chunk.metadata.update(filename=chunk.metadata['source'])

In [None]:
# Data generation
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Both the question generator and the critic run on OpenAI chat models.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-3.5-turbo")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Share of each question evolution type in the generated test set.
question_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# Build a 30-sample test set from the loaded chunks.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=30,
    distributions=question_mix,
)

In [None]:
# Convert the generated test set to a DataFrame for inspection.
df = testset.to_pandas()
df

In [None]:
from datasets import Dataset
# Wrap the DataFrame in a `datasets.Dataset`, the object passed to `evaluate`.
dataset = Dataset.from_pandas(df)

In [None]:
# Show the Dataset's summary representation.
dataset

In [None]:
from ragas.metrics import (
    answer_relevancy, # this is similar to llamaindex relevancy evalutor
    faithfulness,
    context_recall,
    context_precision,
)

from ragas import evaluate

# Score the dataset on the two context-focused metrics only.
# NOTE(review): `dataset` is built from the generated test set, not from the
# RAG chain's own retrievals or answers; confirm these scores measure what
# is intended.
retrieval_metrics = [context_precision, context_recall]
result = evaluate(dataset, metrics=retrieval_metrics)

result