In [None]:
# Install Jinja2 into the kernel's environment.
# NOTE(review): presumably required by `DataFrame.to_latex` in the last cell — confirm,
# and consider pinning a version so the notebook stays reproducible.
%pip install Jinja2

In [None]:
from dotenv import load_dotenv

# Load environment variables from ../.env; later cells read VECTOR_STORE_URL and
# VECTOR_STORE_API_KEY from os.environ.
# NOTE(review): provider credentials (AWS / OpenAI) presumably come from this file too — confirm.
load_dotenv('../.env')

In [None]:
import os
from tqdm import tqdm
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain_aws import BedrockEmbeddings, ChatBedrock
from qdrant_client.http.models import Distance, VectorParams
from langchain_core.rate_limiters import InMemoryRateLimiter
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import BaseMessage

In [None]:
from time import sleep
import random

def invoke_llm_with_backoff(llm: "ChatBedrock", prompt, max_retries=5):
    """Invoke ``llm`` with ``prompt``, retrying failed calls with exponential backoff.

    Any exception raised by ``llm.invoke`` counts as a failed attempt (not only
    throttling errors). Between attempts the function sleeps for
    ``2 ** attempt`` seconds plus up to 5 seconds of random jitter.

    Args:
        llm: Chat model exposing ``invoke(prompt)``.
        prompt: Value forwarded unchanged to ``llm.invoke``.
        max_retries: Maximum number of invocation attempts.

    Returns:
        Whatever ``llm.invoke(prompt)`` returns on the first successful attempt.

    Raises:
        Exception: If every attempt fails (or ``max_retries`` is not positive).
            The last underlying error, if any, is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            return llm.invoke(prompt)
        except Exception as exc:
            last_error = exc
            print(exc)
            if attempt == max_retries:
                # No attempt left: waiting here would only delay the final error.
                break
            # Exponential backoff with jitter.
            wait_time = random.uniform(2 ** attempt, 2 ** attempt + 5)
            print(f"Attempt {attempt}/{max_retries} failed. Retrying in {wait_time:.2f} seconds...")
            sleep(wait_time)

    # Chain the last failure so the root cause is visible in the traceback.
    raise Exception("Max retries reached, could not invoke model.") from last_error

from datasets import Dataset

def pandas_to_ragas(df):
    """Convert an evaluation DataFrame into a Hugging Face ``Dataset`` for ragas.

    Every column is cast to ``str`` (missing values become ``''``). The
    ``reference_contexts`` and ``retrieved_contexts`` columns are then parsed
    from their textual list form back into Python lists; empty cells become
    an empty list.

    The input frame is not modified: all conversions happen on a copy.

    Args:
        df: DataFrame with at least the ``reference_contexts`` and
            ``retrieved_contexts`` columns, each cell holding a list or the
            string representation of one.

    Returns:
        The ``Dataset`` produced by ``Dataset.from_dict`` from the converted columns.

    Raises:
        ValueError, SyntaxError: If a non-empty context cell is not a valid
            Python literal.
    """
    import ast  # local import keeps this definition self-contained

    def _parse_contexts(text):
        # literal_eval only accepts Python literals, so CSV content can never
        # execute code the way eval() could.
        text = text.strip()
        return ast.literal_eval(text) if text else []

    # Work on a copy so the caller's frame keeps its original dtypes and values.
    converted = df.copy()
    for col in converted.columns:
        converted[col] = converted[col].fillna('').astype(str)

    # Context columns hold stringified lists at this point; turn them back into lists.
    for col in ('reference_contexts', 'retrieved_contexts'):
        converted[col] = converted[col].apply(_parse_contexts)

    return Dataset.from_dict(converted.to_dict('list'))

In [None]:
# Client-side rate limiter for LLM calls.
# NOTE(review): no visible cell passes `rate_limiter` to a model, so it currently has
# no effect — confirm whether it should be wired into ChatBedrock.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.07,  # <-- Super slow! 0.07 req/s is roughly one request every 14 seconds
    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether allowed to make a request,
    # max_bucket_size=10,  # Controls the maximum burst size.
)

# OpenAI judge model and embeddings wrapped for ragas; `evaluator_llm` is passed to the
# LLM-based metrics in the evaluation loop.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [None]:
# Bedrock embedding client handed to the Qdrant vector store for query embedding.
# NOTE(review): no model id is given, so the library default is used — confirm it matches
# the model that was used to build the collection.
embeddings = BedrockEmbeddings(
    region_name='us-east-1',
    )

In [None]:
# Qdrant connection; URL and API key come from the environment (a missing variable
# raises KeyError here instead of failing later).
client = QdrantClient(
    location=os.environ["VECTOR_STORE_URL"],
    api_key=os.environ["VECTOR_STORE_API_KEY"]
)

# Vector store over the "regulamento-semantic" collection, queried with `embeddings`.
# NOTE(review): assumes the collection already exists and was indexed with a compatible
# embedding model — confirm.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="regulamento-semantic",
    embedding=embeddings
)

In [None]:
# Portuguese instruction prompt for the answer-generation model. The placeholders
# {pergunta} (question) and {contexto} (retrieved context) are filled in per question
# by the evaluation loop.
prompt = """
Você é um assistente da UFRN responsável por instruir alunos sobre questões acadêmicas do regulamento dos cursos de graduação.
Você deve responder a resposta correta baseada na questão e contexto abaixo. Por favor, siga as instruções:

1. Pergunta: {pergunta}

2. Contexto: {contexto}

3. Instruções:
    - Analise cuidadosamente a questão e o contexto fornecido.
    - Formule uma resposta abrangente e precisa baseada apenas nas informações fornecidas no contexto.
    - Certifique-se de que sua resposta aborda diretamente a pergunta.
    - Inclua todas as informações relevantes do contexto, mas não adicione nenhum conhecimento externo.
    - Se o contexto não contiver informações suficientes para responder completamente à pergunta, declare isso claramente e forneça a melhor resposta parcial possível.
    - Use um tom formal e objetivo.
    - Responda somente o perguntando, evite apresentar a resposta com palavras ou frases introdutórias como "Resposta:".
"""

# Reusable template built from the string above.
# NOTE(review): the evaluation loop later rebinds the name `prompt` to each formatted
# prompt, so `prompt` no longer holds this template text after that cell runs.
prompt_template = PromptTemplate.from_template(prompt)

In [None]:
import pandas as pd
from tqdm import tqdm
from ragas import evaluate
from ragas.metrics import (
    NonLLMContextRecall,
    NonLLMContextPrecisionWithReference,
    NoiseSensitivity,
    ResponseRelevancy,
    Faithfulness,
)
from time import time

# Bedrock model ids to benchmark; uncomment entries to include them in the sweep.
models = [
    # 'amazon.nova-lite-v1:0',
    'amazon.nova-micro-v1:0',
    # 'meta.llama3-8b-instruct-v1:0',
    # 'mistral.mistral-7b-instruct-v0:2',
    # 'mistral.mixtral-8x7b-instruct-v0:1'
    ]

# Number of chunks retrieved from the vector store for each question.
k_values = [60, 70, 80, 90, 100]

# The test set is identical for every run, so it is read only once.
testset_df = pd.read_csv('../data/dataset_potiguana.csv')

# One result frame per (model, k) run, concatenated once after the sweep.
# The list is rebuilt on every execution of this cell, so the cell works on a
# fresh kernel and re-running it never duplicates rows.
result_frames = []

for model_name in models:
    # The LLM client does not depend on k, so it is built once per model.
    llm = ChatBedrock(
        model_id=model_name,
        temperature=.0,
        region='us-east-1',
        )

    for k in k_values:
        # Fresh copy per run: columns added below must not leak into the next run.
        df = testset_df.copy()

        retrieved_contexts = []
        spent_time = []
        responses = []
        prompts = []
        input_tokens_list = []
        output_tokens_list = []
        total_tokens_list = []

        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"{model_name} k={k}"):
            # Question for this row
            query = row['user_input']

            # Retrieve the k most similar chunks
            context_docs = vector_store.similarity_search(query, k=k)
            contexts = [c.page_content for c in context_docs]

            # Build the prompt
            formatted_contexts = '\n'.join(contexts)
            prompt = prompt_template.format(pergunta=query, contexto=formatted_contexts)

            # Ask the LLM and time the call (includes any backoff waits)
            start = time()
            response: BaseMessage = invoke_llm_with_backoff(llm=llm, prompt=prompt)
            llm_response_time = time() - start

            # Token usage reported by the model
            usage = response.usage_metadata
            input_tokens_list.append(usage['input_tokens'])
            output_tokens_list.append(usage['output_tokens'])
            total_tokens_list.append(usage['total_tokens'])

            retrieved_contexts.append(contexts)
            spent_time.append(llm_response_time)
            responses.append(response.content)
            prompts.append(prompt)

        df['retrieved_contexts'] = pd.Series(retrieved_contexts)
        df['response'] = pd.Series(responses)
        eval_dataset = pandas_to_ragas(df)

        metrics = [
            NonLLMContextPrecisionWithReference(threshold=0.95),
            NonLLMContextRecall(threshold=0.95),
            # NoiseSensitivity(llm=evaluator_llm),
            ResponseRelevancy(llm=evaluator_llm),
            Faithfulness(llm=evaluator_llm),
            ]

        results = evaluate(dataset=eval_dataset, metrics=metrics)
        result_df = results.to_pandas()

        # Attach run parameters and per-question generation statistics.
        result_df['k'] = k
        result_df['model'] = model_name
        result_df['llm_response_time'] = pd.Series(spent_time)
        result_df['prompt'] = pd.Series(prompts)
        result_df['input_tokens'] = pd.Series(input_tokens_list)
        result_df['output_tokens'] = pd.Series(output_tokens_list)
        result_df['total_tokens'] = pd.Series(total_tokens_list)

        result_frames.append(result_df)

# Single concatenation instead of growing a frame inside the loop.
all_result_df = (
    pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
)

all_result_df.to_csv('../generation_in_k.csv')

In [None]:
# Total number of result rows collected across all (model, k) runs.
len(all_result_df)

In [None]:
# Number of rows with a non-null answer_relevancy score.
len(all_result_df[all_result_df['answer_relevancy'].notna()])

In [None]:
# Number of rows with a non-null faithfulness score.
len(all_result_df[all_result_df['faithfulness'].notna()])

In [None]:
# Peek at the first two result rows.
all_result_df.head(2)

In [None]:
import matplotlib.pyplot as plt

# Mean of each generation-quality metric for every value of K.
answer_relevancy = all_result_df.groupby('k')['answer_relevancy'].mean()
faithfulness = all_result_df.groupby('k')['faithfulness'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(answer_relevancy.index, answer_relevancy.values, marker='o', linestyle='-', color='b', label='Answer Relevancy')
ax.plot(faithfulness.index, faithfulness.values, marker='s', linestyle='--', color='r', label='Faithfulness')
# The title names the metrics that are actually plotted.
ax.set_title('Mean Answer Relevancy and Faithfulness vs. K', fontsize=16)
ax.set_xlabel('K', fontsize=14)
ax.set_ylabel('Mean Score', fontsize=14)
ax.set_xticks(answer_relevancy.index)  # show every evaluated K on the x axis
ax.grid(True)
ax.legend(fontsize=12)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Mean prompt size and mean LLM latency for every value of K.
input_tokens = all_result_df.groupby('k')['input_tokens'].mean()
llm_response_time = all_result_df.groupby('k')['llm_response_time'].mean()

# Tokens and seconds live on very different scales, so each series gets its own
# y axis; on a shared axis the response-time curve would be flattened.
fig, ax_tokens = plt.subplots(figsize=(10, 6))
ax_time = ax_tokens.twinx()

tokens_line, = ax_tokens.plot(input_tokens.index, input_tokens.values, marker='o', linestyle='-', color='b', label='Input Tokens')
time_line, = ax_time.plot(llm_response_time.index, llm_response_time.values, marker='s', linestyle='--', color='r', label='Response Time (s)')

ax_tokens.set_title('Mean Input Tokens and LLM Response Time vs. K', fontsize=16)
ax_tokens.set_xlabel('K', fontsize=14)
ax_tokens.set_ylabel('Mean Input Tokens', fontsize=14, color='b')
ax_time.set_ylabel('Mean Response Time (s)', fontsize=14, color='r')
ax_tokens.set_xticks(input_tokens.index)  # show every evaluated K on the x axis
ax_tokens.grid(True)
# One combined legend, drawn on the top-most axes so no line hides it.
ax_time.legend(handles=[tokens_line, time_line], fontsize=12)

plt.show()

In [None]:
# Columns to summarise
columns_of_interest = ['input_tokens', 'llm_response_time']

# Group by 'k' and take the mean of the selected columns
average_values = all_result_df.groupby('k')[columns_of_interest].mean().reset_index()

# Display the resulting table
average_values

In [None]:
import matplotlib.pyplot as plt

# Mean LLM response time (seconds) for every value of K.
llm_response_time = all_result_df.groupby('k')['llm_response_time'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(llm_response_time.index, llm_response_time.values, marker='o', linestyle='-', color='b', label='Response Time (s)')
ax.set_title('Mean LLM Response Time vs. K', fontsize=16)
ax.set_xlabel('K', fontsize=14)
ax.set_ylabel('Mean Response Time (s)', fontsize=14)
# Ticks come from this cell's own series, so the cell does not depend on
# variables created by another plotting cell.
ax.set_xticks(llm_response_time.index)
ax.grid(True)
ax.legend(fontsize=12)

plt.show()

In [None]:
# Distinct model ids present in the collected results.
all_result_df['model'].unique()

In [None]:
# Peek at the first two result rows.
all_result_df.head(2)

In [None]:
# Total number of result rows.
len(all_result_df)

In [None]:
# Number of rows with a non-null faithfulness score.
len(all_result_df[all_result_df['faithfulness'].notna()])

In [None]:
# Number of rows with a non-null answer_relevancy score.
len(all_result_df[all_result_df['answer_relevancy'].notna()])

In [None]:
# Number of rows with a non-null generated response.
len(all_result_df[all_result_df['response'].notna()])

In [None]:
# Quick look at the raw faithfulness scores in row order (x axis is the row index).
all_result_df['faithfulness'].plot()

In [None]:
import pandas as pd

# Load a saved evaluation file and preview it.
# NOTE(review): this is not the '../generation_in_k.csv' file written by the sweep above,
# and it rebinds `df` — confirm this is the intended input.
df = pd.read_csv('../generator_eval_v2.csv')
df.head(2)

In [None]:
# Columns to summarise
columns_of_interest = ['answer_relevancy', 'faithfulness', 'llm_response_time', 'output_tokens']

# Group by 'model' and take the mean of the selected columns
average_values = df.groupby('model')[columns_of_interest].mean().reset_index()

# Display the resulting table
average_values

In [None]:
# Print the per-model averages as LaTeX source (plain text, ready to paste into a document).
print(average_values.to_latex())