### Loading Libraries

In [21]:
import os
import pandas as pd
from dotenv import load_dotenv

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator

from langchain_community.document_loaders import DirectoryLoader,DataFrameLoader

# Read key/value pairs from a local .env file into os.environ.
# NOTE: override=True means values from .env take precedence over variables
# that are already set in the process environment.
load_dotenv(override=True)

True

### Loading Processed data

In [22]:
# Directory containing the processed data files; every file in it is read
# with pandas.read_parquet.
path = "data/processed/files/"

# Sorted so the row order (and therefore the seeded sample below) does not
# depend on the filesystem's listing order. Sub-directories and hidden files
# such as .DS_Store are skipped because read_parquet would fail on them.
filelist = sorted(
    name
    for name in os.listdir(path)
    if os.path.isfile(os.path.join(path, name)) and not name.startswith(".")
)
if not filelist:
    raise FileNotFoundError(f"No data files found in {path!r}")

# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on each iteration.
# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
combined_df = pd.concat(
    [pd.read_parquet(os.path.join(path, filename)) for filename in filelist],
    ignore_index=True,
)

# Merge title and content into a single "text" column, then drop the source
# columns and the vector/pipeline columns that are not used further on.
combined_df['text'] = "Title: "+ combined_df['title'].astype(str) + "\n" + "Content: "+ combined_df['content'].astype(str)
combined_df = combined_df.drop(columns = ["title","content","title_vector","content_vector","preprocessing_pipeline"])

# Work on a random subset of at most SAMPLE_SIZE rows. The fixed seed makes
# Restart & Run All pick the same rows every time, and min() avoids the
# ValueError DataFrame.sample raises when asked for more rows than exist.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42
combined_df = combined_df.sample(
    n=min(SAMPLE_SIZE, len(combined_df)),
    random_state=SAMPLE_SEED,
).reset_index(drop=True)

### Configuring Azure OpenAI settings from environment variables

In [23]:
# Fail fast with one readable message if a required setting is absent.
# The API key is not passed explicitly when the Azure clients are built in
# this notebook, so it is presumably picked up from the environment by the
# client library; the endpoint is read directly below.
required_env_vars = ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT")
missing_env_vars = [name for name in required_env_vars if name not in os.environ]
if missing_env_vars:
    # KeyError is kept as the exception type: it is what a plain
    # os.environ[...] lookup raises for a missing variable.
    raise KeyError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

# other configuration
azure_configs = {
    "base_url":os.environ["AZURE_OPENAI_ENDPOINT"],  # your endpoint
    "model_deployment": "gpt-4o",  # your model deployment name
    "model_name": "gpt-4o",  # your model name
    "embedding_deployment": "text-embedding-3-small",  # your embedding deployment name
    "embedding_name": "text-embedding-3-small",  # your embedding name
}

### Creating generator

In [24]:

# Both Azure clients below are created against the same API version.
azure_api_version = "2024-10-21"

# Chat model used for test set generation, wrapped so ragas can drive it.
azure_chat_model = AzureChatOpenAI(
    openai_api_version=azure_api_version,
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["model_deployment"],
    model=azure_configs["model_name"],
    validate_base_url=False,
)
generator_llm = LangchainLLMWrapper(azure_chat_model)

# Embedding model handed to the TestsetGenerator alongside the LLM,
# wrapped the same way.
azure_embedding_model = AzureOpenAIEmbeddings(
    openai_api_version=azure_api_version,
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["embedding_deployment"],
    model=azure_configs["embedding_name"],
)
generator_embeddings = LangchainEmbeddingsWrapper(azure_embedding_model)

### Running generator

In [25]:
# Convert the sampled DataFrame into LangChain documents. The "text" column
# is used as the document body; it is DataFrameLoader's default
# page_content_column and is only spelled out here for readability.
loader = DataFrameLoader(combined_df, page_content_column="text")
docs = loader.load()

# Build the ragas generator from the wrapped LLM and embedding model, then
# request a synthetic test set from the loaded documents.
# NOTE: this call issues many LLM/embedding requests and is slow to re-run.
requested_testset_size = 100
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)
dataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=requested_testset_size,
)

Applying HeadlinesExtractor:   0%|          | 0/440 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/500 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to ap

Applying SummaryExtractor:   0%|          | 0/727 [00:00<?, ?it/s]

Property 'summary' already exists in node '65afc4'. Skipping!
Property 'summary' already exists in node 'f14cbe'. Skipping!
Property 'summary' already exists in node '3caf85'. Skipping!
Property 'summary' already exists in node 'abbd50'. Skipping!
Property 'summary' already exists in node 'ea683f'. Skipping!
Property 'summary' already exists in node 'ade532'. Skipping!
Property 'summary' already exists in node 'd6a911'. Skipping!
Property 'summary' already exists in node '58e512'. Skipping!
Property 'summary' already exists in node 'cf9659'. Skipping!
Property 'summary' already exists in node '02e2b3'. Skipping!
Property 'summary' already exists in node '5b06d8'. Skipping!
Property 'summary' already exists in node 'd45f57'. Skipping!
Property 'summary' already exists in node 'f23f0d'. Skipping!
Property 'summary' already exists in node '0b6e6e'. Skipping!
Property 'summary' already exists in node 'e1314f'. Skipping!
Property 'summary' already exists in node '3e77ea'. Skipping!
Property

Applying CustomNodeFilter:   0%|          | 0/320 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/1253 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '65afc4'. Skipping!
Property 'summary_embedding' already exists in node 'f14cbe'. Skipping!
Property 'summary_embedding' already exists in node 'ade532'. Skipping!
Property 'summary_embedding' already exists in node 'ea683f'. Skipping!
Property 'summary_embedding' already exists in node '99074c'. Skipping!
Property 'summary_embedding' already exists in node '0b6e6e'. Skipping!
Property 'summary_embedding' already exists in node 'd6a911'. Skipping!
Property 'summary_embedding' already exists in node '3caf85'. Skipping!
Property 'summary_embedding' already exists in node 'cf9659'. Skipping!
Property 'summary_embedding' already exists in node 'e1314f'. Skipping!
Property 'summary_embedding' already exists in node 'abbd50'. Skipping!
Property 'summary_embedding' already exists in node '02e2b3'. Skipping!
Property 'summary_embedding' already exists in node '58e512'. Skipping!
Property 'summary_embedding' already exists in node 'f2ae8c'. Sk

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/102 [00:00<?, ?it/s]

In [26]:
# Convert the generated test set into a pandas DataFrame for inspection.
output_dataset = dataset.to_pandas()

In [28]:
# Display the generated samples (bare last expression -> rich DataFrame repr).
output_dataset

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What sustainability initiatives has Alphabet I...,[Ongoing Commitment to Sustainability Content:...,Alphabet Inc. has implemented several sustaina...,single_hop_specifc_query_synthesizer
1,Why insurance not always enough for cover prod...,"[Title: Claims, Litigation, Government Investi...","Although liability insurance is maintained, it...",single_hop_specifc_query_synthesizer
2,What tax challenges do online marketplaces face?,[We Face Additional Tax Liabilities and Collec...,Online marketplaces face tax challenges such a...,single_hop_specifc_query_synthesizer
3,How is interest for Base Rate Loans computed w...,[Title: 2.08 Computation of Interest and Fees ...,"Interest for Base Rate Loans, when the Base Ra...",single_hop_specifc_query_synthesizer
4,What is the role of the Administrative Agent's...,[2.10 Payments Generally; Administrative Agent...,The Administrative Agent's Office is responsib...,single_hop_specifc_query_synthesizer
...,...,...,...,...
97,Wht is the role of Form 8-K in disclosing Tran...,[<1-hop>\n\nTransfer Restriction Agreements (d...,Form 8-K plays a role in disclosing updates su...,multi_hop_specific_query_synthesizer
98,How does the adoption of Azure IoT Edge contri...,[<1-hop>\n\nOur increasing focus on cloud-base...,The adoption of Azure IoT Edge is integral to ...,multi_hop_specific_query_synthesizer
99,How do the proceeds from debt issuances and th...,[<1-hop>\n\nDebt Proceeds We issue debt to tak...,The proceeds from debt issuances are used for ...,multi_hop_specific_query_synthesizer
100,How does the non-transferability of SAs and th...,[<1-hop>\n\nNon-Transferability of SAs Content...,"The non-transferability of SAs, as outlined in...",multi_hop_specific_query_synthesizer
