In [None]:
%pip install -qqqU python-dotenv langchain langchain-openai ragas chromadb wandb tiktoken openai jq rapidfuzz jsonlines

In [None]:
from dotenv import load_dotenv

# Load environment variables from the .env file one directory above the notebook.
# NOTE(review): presumably this supplies the OpenAI API key used by later cells — confirm.
load_dotenv('../.env')

In [None]:
import json
from langchain.schema import Document
def load_documents(json_path: str = './data/regulamento-semantic.json'):
    """Load LangChain documents from a JSON Lines file.

    Despite the ``.json`` extension of the default path, the file is parsed
    line by line: every non-blank line must hold one JSON object with an
    optional ``page_content`` string and an optional ``metadata`` object.

    Args:
        json_path: Path to the UTF-8 encoded JSON Lines file.

    Returns:
        A list with one ``Document`` per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    documents = []

    with open(json_path, 'r', encoding='utf-8') as file:
        for line in file:
            record = line.strip()
            # Blank lines (typically a trailing newline at end of file) are
            # not records; json.loads("") would raise on them.
            if not record:
                continue

            data = json.loads(record)

            # `or` also covers explicit JSON nulls, which Document rejects.
            documents.append(
                Document(
                    page_content=data.get("page_content") or "",
                    metadata=data.get("metadata") or {},
                )
            )

    return documents

# Load the source documents from load_documents' default path.
docs = load_documents()

## Translating Portuguese to English due to RAGAS constraints

In [None]:
from langchain_openai import ChatOpenAI
from langchain.schema import Document

def translate_documents_to_english_langchain(documents, model_name="gpt-4o-mini"):
    """Translate the content of LangChain documents to English with a chat model.

    One request is sent per document. If a request fails, the error is printed
    and the document is kept with its original (untranslated) text, so the
    output always has one entry per input document, in the same order.

    Args:
        documents: Iterable of ``Document`` instances to translate.
        model_name: Name of the OpenAI chat model used for the translation.

    Returns:
        A list of new ``Document`` objects. Each one carries the translated
        text (or the original text on failure) and a shallow copy of the
        source document's metadata.

    Raises:
        ValueError: If an item of ``documents`` is not a ``Document``.
    """
    llm = ChatOpenAI(model=model_name, temperature=0)

    translated_documents = []

    for doc in documents:
        if not isinstance(doc, Document):
            raise ValueError("Os documentos devem ser instâncias da classe Document.")

        # Prompt sent to the model (kept in Portuguese, matching the source text).
        translation_prompt = f"Por favor, traduza o seguinte texto para inglês:\n\n{doc.page_content}"

        try:
            # `invoke` is the supported chat-model entry point; `predict` is deprecated.
            translated_text = llm.invoke(translation_prompt).content
        except Exception as e:
            print(f"Erro ao traduzir o documento: {e}")
            # Best effort: fall back to the original text for this document.
            translated_text = doc.page_content

        # Copy the metadata so the new document does not share a mutable dict
        # with the source document. (The previous fallback used `{**doc}`,
        # which raises TypeError because a Document is not a mapping.)
        translated_documents.append(
            Document(page_content=translated_text, metadata=dict(doc.metadata))
        )

    return translated_documents


# Translate every loaded document to English.
# NOTE(review): a later cell reassigns `translated_docs` from
# '../regulamento-docs-eng.jsonl', and the save_docs_to_jsonl call that would
# write that file is commented out — confirm whether this cell is still meant
# to run on a fresh kernel.
translated_docs = translate_documents_to_english_langchain(docs)

## Load/Save Dataset

In [None]:
import typing as t
import jsonlines
from langchain.schema import Document


def save_docs_to_jsonl(documents: t.Iterable[Document], file_path: str) -> None:
    """Serialize documents to a JSON Lines file, one document per line.

    Args:
        documents: Documents to persist; each is converted with ``.dict()``.
        file_path: Destination path; opened in write mode.
    """
    with jsonlines.open(file_path, mode="w") as sink:
        sink.write_all(item.dict() for item in documents)


def load_docs_from_jsonl(file_path) -> t.Iterable[Document]:
    """Read a JSON Lines file and rebuild one ``Document`` per record.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list of ``Document`` objects in file order, each built by passing
        the record's keys as keyword arguments.
    """
    with jsonlines.open(file_path, mode="r") as source:
        return [Document(**record) for record in source]

# save_docs_to_jsonl(translated_docs, '../regulamento-docs-eng.jsonl')

In [None]:
# Reload the translated documents from the JSONL file, replacing whatever
# `translated_docs` held before.
# NOTE(review): the only call that writes this file (save_docs_to_jsonl) is
# commented out, so the file must already exist on disk — confirm.
translated_docs = load_docs_from_jsonl('../regulamento-docs-eng.jsonl')

## Setting Personas

In [None]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.rate_limiters import InMemoryRateLimiter

# Client-side throttle; it is attached to the generator chat model below.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.2,  # <-- 0.2 req/s: a sustained rate of one request every 5 seconds
    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether allowed to make a request,
    max_bucket_size=10,  # Controls the maximum burst size.
)


# Chat model (rate limited) and embeddings wrapped in the ragas adapter classes.
# OpenAIEmbeddings() is created with its default settings.
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o", rate_limiter=rate_limiter))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [None]:
from ragas.testset.persona import Persona

# Persona list passed to TestsetGenerator (persona_list=personas) in a later cell.
# Only one persona is defined: an undergraduate student.
personas = [
    Persona(
        name="student",
        role_description="An undergraduate student at UFRN who has questions about academic rules and regulations with no context about it, do not mention articles directly ",
    ),
]

In [None]:
from ragas.testset import TestsetGenerator

# Build the ragas generator from the wrapped LLM, embeddings and persona list,
# then request a test set of 300 samples from the translated documents.
# NOTE(review): presumably slow and costly given the rate limiter attached to
# generator_llm — confirm before re-running.
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings, persona_list=personas)
dataset = generator.generate_with_langchain_docs(translated_docs, testset_size=300)

In [None]:
# Convert the generated test set to a DataFrame, save it as CSV and preview it.
# to_csv is called without index=False, so the row index is written as an
# extra leading column.
df = dataset.to_pandas()
df.to_csv("../ragas_openai_gpt4o-en.csv")
df.head()

## Translating English to Portuguese

In [None]:
import pandas as pd
from langchain_openai import ChatOpenAI

def translate_ragas_dataframe_to_portuguese(df, model_name="gpt-4o-mini"):
    """Translate the text columns of a ragas test-set DataFrame to Portuguese.

    The columns ``user_input``, ``reference_contexts`` and ``reference`` are
    translated when present; a message is printed for each one that is
    missing. The input DataFrame is not modified.

    Cell handling:
      * list/tuple cells are translated element by element and keep their
        container type (instead of being sent as one Python repr string);
      * ``None`` and NaN cells are returned unchanged without calling the model;
      * if a request fails, the error is printed and the original value is kept.

    Args:
        df: DataFrame holding the generated test set.
        model_name: Name of the OpenAI chat model used for the translation.

    Returns:
        A copy of ``df`` with the available text columns translated.
    """
    # Chat model used for every translation request.
    llm = ChatOpenAI(model=model_name, temperature=0)

    def is_missing(value):
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    def translate_text(text, column_name):
        # Translate each entry of a sequence separately so the cell keeps its
        # structure rather than collapsing into a single string.
        if isinstance(text, (list, tuple)):
            return type(text)(translate_text(item, column_name) for item in text)

        # Nothing to translate; avoid sending "None"/"nan" to the model.
        if is_missing(text):
            return text

        try:
            prompt = f"Translate the following text to Portuguese:\n\n{text}"
            # `invoke` is the supported chat-model entry point; `predict` is deprecated.
            return llm.invoke(prompt).content
        except Exception as e:
            print(f"Erro ao traduzir a coluna {column_name}: {e}")
            return text  # Best effort: keep the original text on failure.

    # Work on a copy so the caller's DataFrame stays untouched.
    translated_df = df.copy()

    for column in ['user_input', 'reference_contexts', 'reference']:
        if column in df.columns:
            # Bind the column name as a default so the lambda does not depend
            # on the loop variable's later values.
            translated_df[column] = df[column].apply(
                lambda value, name=column: translate_text(value, name)
            )
        else:
            print(f"A coluna {column} não foi encontrada no DataFrame.")

    return translated_df

# Translate the text columns of the English test set to Portuguese.
translated_df = translate_ragas_dataframe_to_portuguese(df)

In [None]:
# Save the Portuguese test set as CSV and preview it.
# to_csv is called without index=False, so the row index is written as an
# extra leading column.
translated_df.to_csv('../ragas_openai_gpt4o-pt.csv')
translated_df.head()