In [1]:
import os
import warnings
from uuid import uuid4

from dotenv import load_dotenv
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
# Load variables from a .env file into the process environment so that the
# os.getenv() calls in the configuration cell can read them.
load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.
  from tqdm.autonotebook import tqdm


True

In [2]:
# Connection settings, read from the environment after load_dotenv() has run.
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_DEPLOYMENT_ID = os.getenv('AZURE_OPENAI_DEPLOYMENT_ID')
AZURE_OPENAI_KEY = os.getenv('AZURE_OPENAI_KEY')
AZURE_API_VERSION = os.getenv('AZURE_API_VERSION')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# os.getenv() returns None for an unset variable without complaining, so a
# missing setting would otherwise only show up later as a client error.
# Warn (rather than raise) because the client libraries may still be able to
# pick up credentials from their own default environment variables.
_unset_env_vars = [
    name
    for name in (
        'AZURE_OPENAI_ENDPOINT',
        'AZURE_OPENAI_DEPLOYMENT_ID',
        'AZURE_OPENAI_KEY',
        'AZURE_API_VERSION',
        'PINECONE_API_KEY',
    )
    if not os.getenv(name)
]
if _unset_env_vars:
    warnings.warn(
        "Environment variables not set: " + ", ".join(_unset_env_vars),
        stacklevel=2,
    )

# Pinecone client handle built from the API key read above.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [3]:
# Azure OpenAI chat model, configured from the environment-derived settings.
llm = AzureChatOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_OPENAI_DEPLOYMENT_ID,
    api_version=AZURE_API_VERSION,
    api_key=AZURE_OPENAI_KEY,
    temperature=0.0,
    verbose=True,
)

# The embedding deployment can now be overridden through the environment,
# like the chat deployment; the default is the previously hard-coded name.
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID = os.getenv(
    'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID', 'embedding-ada-crayon'
)

# Azure OpenAI embedding model sharing the endpoint, key and API version above.
embedding_llm = AzureOpenAIEmbeddings(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID,
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_API_VERSION,
)

# One constant for the Ollama model so the LLM and the embedder cannot drift
# apart by editing only one of the two literals.
OLLAMA_MODEL = "llama3.1"

# Ollama-backed LLM and embeddings; both use the same model name.
critic_llm = OllamaLLM(model=OLLAMA_MODEL)
embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)

In [4]:
# Source PDF, resolved relative to the notebook's working directory.
loader = PyPDFLoader(r"Dummy - CB Policy.pdf")

# Use load() instead of load_and_split(): load_and_split() already chunks the
# text with a default splitter (and is documented as deprecated), so the
# documents were being split twice before reaching the splitter below.
pages = loader.load()

# Single splitting pass with a 1000-character chunk size.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
splits = text_splitter.split_documents(pages)

In [5]:
# Keep only the first two chunks.
# NOTE(review): this rebinds `splits`, so the full list is no longer reachable
# under this name; re-run the splitting cell to get it back.
splits = splits[:2]

In [7]:
# Display the retained chunks (Document objects with metadata and page_content).
splits

[Document(metadata={'source': 'Dummy - CB Policy.pdf', 'page': 0}, page_content='1 \n \nOCBC Information Classification: Internal   \n \n \n \n \n \n \n \n \nCompensation & Benefits \nPolicy  \nPT CRAYON SHINCHAN  \n \nPolicy Effective Date: < 07/08/24> \n \n \n \n \n \n \nNo part of this documentation may be reproduced or transmitted in any form or by any means, electronic \nor mechanical, including photocopying or recording, for any purpose without express written permission \nof the CEO of PT CRAYON  SHINCHAN.  \n \n© 2021, <Company Name Here>. All Rights Reserved'),
 Document(metadata={'source': 'Dummy - CB Policy.pdf', 'page': 1}, page_content='2 \n \nOCBC Information Classification: Internal   \nRevision History  \n  \nVer \nNo. Change \nDescription  Prepared \nBy Reviewed By  Approved \nBy Date  \n      \n  \n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nTABLE OF CONTENTS  \n \n \n \n \n \n \nObjective  4')]

In [8]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Weight of each evolution type in the generated set; the weights sum to 1.0.
evolution_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# `critic_llm` is passed for both the generator role and the critic role.
generator = TestsetGenerator.from_langchain(
    generator_llm=critic_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# Build a two-sample test set from the retained chunks.
testset = generator.generate_with_langchain_docs(
    splits,
    test_size=2,
    distributions=evolution_mix,
)

Filename and doc_id are the same for all nodes.               
Generating: 100%|██████████| 2/2 [26:11<00:00, 785.50s/it] 


In [9]:
# Convert the generated test set to a pandas object for tabular inspection.
test_splits = testset.to_pandas()

In [10]:
# Show the generated rows (question, contexts, ground_truth, evolution_type, ...).
test_splits

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,Here is a question that can be fully answered ...,[1 \n \nOCBC Information Classification: Inter...,The CEO of PT CRAYON SHINCHAN,simple,"[{'source': 'Dummy - CB Policy.pdf', 'page': 0}]",True
1,Here is a question that can be fully answered ...,[1 \n \nOCBC Information Classification: Inter...,No part of this documentation may be reproduce...,simple,"[{'source': 'Dummy - CB Policy.pdf', 'page': 0}]",True


In [27]:
# NOTE(review): `test_pages` is not assigned in any cell visible here, and this
# cell's execution count (27) does not follow the previous one (10). It
# presumably comes from cells that were removed or run out of order; unless it
# is defined elsewhere, a fresh Restart & Run All would raise NameError here.
# TODO confirm and add the cell that builds it.
test_pages

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What actions will be taken if a complaint is s...,[9 \n \nOCBC Information Classification: Inter...,The organisation shall take necessary discipli...,simple,"[{'source': 'Dummy - CB Policy.pdf', 'page': 8}]",True
1,What is the principle of equal pay for equal w...,[4 \n \nOCBC Information Classification: Inter...,The principle of equal pay for equal work in t...,simple,"[{'source': 'Dummy - CB Policy.pdf', 'page': 3}]",True
2,What is the minimum base amount used for calcu...,[8 \n \nOCBC Information Classification: Inter...,The minimum base amount for calculating GPSSA ...,simple,"[{'source': 'Dummy - CB Policy.pdf', 'page': 7}]",True
3,What are the variable benefits included in the...,[5 \n \nOCBC Information Classification: Inter...,The variable benefits included in the compensa...,simple,"[{'source': 'Dummy - CB Policy.pdf', 'page': 4}]",True
4,What is the eligibility criteria for paid leav...,[7 \n \nOCBC Information Classification: Inter...,The eligibility criteria for paid leave for em...,simple,"[{'source': 'Dummy - CB Policy.pdf', 'page': 6}]",True
5,What are the End of Service Benefits provided ...,[6 \n \nOCBC Information Classification: Inter...,End of Service Benefits provided by the organi...,reasoning,"[{'source': 'Dummy - CB Policy.pdf', 'page': 5}]",True
6,What are the specific benefits under Mandatory...,[5 \n \nOCBC Information Classification: Inter...,The specific benefits under Mandatory Benefits...,reasoning,"[{'source': 'Dummy - CB Policy.pdf', 'page': 4}]",True
7,What is the purpose of the Compensation and Be...,[4 \n \nOCBC Information Classification: Inter...,The purpose of the Compensation and Benefits P...,multi_context,"[{'source': 'Dummy - CB Policy.pdf', 'page': 3...",True
