### Loading libraries

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator

from langchain_community.document_loaders import DirectoryLoader,DataFrameLoader

# Load variables from the local .env file into the process environment.
# override=True: values from .env take precedence over variables already set.
load_dotenv(override=True)

True

### Loading processed data

In [2]:
# Directory containing the preprocessed parquet files.
path = "data/processed/files/"

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# stable across machines so the seeded sample below is reproducible.
filelist = sorted(os.listdir(path))
if not filelist:
    raise FileNotFoundError(f"No files found in {path!r}")

# Read every file first and concatenate once. Growing the frame inside the
# loop re-copies all previously accumulated rows on every iteration.
combined_df = pd.concat(
    [pd.read_parquet(os.path.join(path, filename)) for filename in filelist],
    ignore_index=True,
)

# Merge title and content into a single "text" column, then drop the source
# columns and the vector / pipeline metadata columns.
combined_df['text'] = (
    "Title: " + combined_df['title'].astype(str)
    + "\n"
    + "Content: " + combined_df['content'].astype(str)
)
combined_df = combined_df.drop(
    columns=["title", "content", "title_vector", "content_vector", "preprocessing_pipeline"]
)

# Down-sample to keep the generation run small. The size is capped at the
# number of available rows (sample() raises when asked for more rows than
# exist) and the seed is fixed so a re-run selects the same rows.
SAMPLE_SIZE = 400
SAMPLE_SEED = 42
combined_df = (
    combined_df
    .sample(n=min(SAMPLE_SIZE, len(combined_df)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

### Loading environment variables

In [3]:
# Fail fast, with a readable message, when a required Azure setting is absent.
# AZURE_OPENAI_API_KEY is not passed explicitly anywhere in this notebook, so
# it is presumably picked up from the environment by the Azure clients.
missing_vars = [
    var_name
    for var_name in ("AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT")
    if var_name not in os.environ
]
if missing_vars:
    raise KeyError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

# other configuration
azure_configs = {
    "base_url": os.environ["AZURE_OPENAI_ENDPOINT"],  # your endpoint
    "model_deployment": "gpt-4o",  # your model deployment name
    "model_name": "gpt-4o",  # your model name
    "embedding_deployment": "text-embedding-3-small",  # your embedding deployment name
    "embedding_name": "text-embedding-3-small",  # your embedding name
}

### Creating generator

In [4]:

# Azure-hosted chat model, wrapped so RAGAS can use it as the generator LLM.
azure_chat_model = AzureChatOpenAI(
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["model_deployment"],
    model=azure_configs["model_name"],
    openai_api_version="2024-10-21",
    validate_base_url=False,
)
generator_llm = LangchainLLMWrapper(azure_chat_model)

# Azure-hosted embedding model, wrapped for RAGAS and handed to the test set
# generator alongside the LLM.
azure_embedding_model = AzureOpenAIEmbeddings(
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["embedding_deployment"],
    model=azure_configs["embedding_name"],
    openai_api_version="2024-10-21",
)
generator_embeddings = LangchainEmbeddingsWrapper(azure_embedding_model)

### Running generator

In [None]:
# Convert the sampled rows into LangChain documents; the "text" column is
# used as each document's page content.
loader = DataFrameLoader(combined_df, page_content_column="text")
docs = loader.load()

# Build the RAGAS test set generator from the wrapped LLM and embeddings,
# then request a 50-sample test set from the loaded documents.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)
dataset = generator.generate_with_langchain_docs(docs, testset_size=50)

Applying HeadlinesExtractor:   0%|          | 0/342 [00:00<?, ?it/s]