# Prototyping Notebook

Prototype notebook for data loading, retrieval, and evaluation (evals).



In [None]:

from dotenv import load_dotenv
import os
from uuid import uuid4

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast and name every missing credential in one go. An explicit raise is
# used instead of `assert` because assertions are stripped under `python -O`,
# and a chain of asserts would only ever report the first missing key.
required_env_vars = (
    "OPENAI_API_KEY",
    "TAVILY_API_KEY",
    "LANGCHAIN_API_KEY",
    "COHERE_API_KEY",
)
missing_env_vars = [name for name in required_env_vars if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(f"Missing {', '.join(missing_env_vars)}")

langsmith_api_key = os.getenv("LANGCHAIN_API_KEY")

# LangSmith settings: switch tracing on and choose the project name that is
# also handed to the agent later in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
langsmith_project_name = "AIM-CERT-LANGGRAPH-BASELINE"
os.environ["LANGCHAIN_PROJECT"] = langsmith_project_name
print(langsmith_project_name)

# Mode flag forwarded to the agent; its meaning is defined in langgraph_agent
# (not visible from this notebook).
MODE = "PIPE"

from langgraph_agent import LangGraphAgent, RetrievalEnums

# Baseline agent: naive retriever, reporting to the LangSmith project
# configured for this notebook.
Agent = LangGraphAgent(
    retriever_mode=RetrievalEnums.NAIVE,
    MODE=MODE,
    langchain_project_name=langsmith_project_name,
)


# Load Eval Golden Dataset

In [None]:
from pathlib import Path
import pickle

# NOTE: unpickling can execute arbitrary code, so only load dataset files that
# were produced by this project.
with open("eval_golden_dataset.pkl", "rb") as dataset_file:
    golden_dataset = pickle.load(dataset_file)

# Evaluations

In [None]:

import copy
import pickle
import os
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from ragas import EvaluationDataset
from ragas import evaluate as evaluate_ragas, RunConfig
from ragas.metrics import (
    LLMContextPrecisionWithReference,
    LLMContextRecall,
    ContextEntityRecall,
    Faithfulness,
    ResponseRelevancy
    )
from langsmith.evaluation import LangChainStringEvaluator, evaluate as evaluate_langsmith
from langsmith import Client as ClientLangSmith
import numpy as np
from retrievers import get_retrieval_chains_and_wrappers_for_evals
import pickle
import prompts
import importlib
importlib.reload(prompts)
from prompts import get_rag_prompt

# Presumably the names of evaluations to skip; nothing is skipped here.
skip_evaluation = []

# Containers for RAGAS and LangSmith evaluation output.
eval_full_results, eval_summary_results = {}, {}
eval_langsmith_raw_results, eval_langsmith_summary_results = {}, {}

# Work on a deep copy so the loaded golden dataset is never mutated.
golden_dataset_active_copy = copy.deepcopy(golden_dataset)
print(f"data type dataset: {type(golden_dataset_active_copy)}")

# Ask the agent each golden question and store its answer and retrieved
# contexts on the matching evaluation sample.
for test_row in golden_dataset_active_copy:
    eval_sample = test_row.eval_sample
    response = Agent(eval_sample.user_input)
    # Reset the agent between questions. NOTE(review): the method name is
    # spelled `reset_memroy`; it must match the definition in langgraph_agent.
    Agent.reset_memroy()
    eval_sample.response = response["response"].content

    # TODO: the way retrieved contexts are extracted here needs to change.
    eval_sample.retrieved_contexts = [
        document.page_content for document in response["context"]
    ]


# Rebuild a RAGAS EvaluationDataset from the answered samples via pandas.
evaluation_active_dataset = EvaluationDataset.from_pandas(
    golden_dataset_active_copy.to_pandas()
)

# LLM used by the RAGAS metrics as the judge.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))

# Raised timeout for the evaluation run (presumably seconds — confirm against
# the RAGAS RunConfig documentation).
custom_run_config = RunConfig(timeout=360)

# Run the evaluation with context metrics (precision, recall, entity recall)
# together with faithfulness and response relevancy.
print("Running RAGAS evaluation...")
ragas_metrics = [
    LLMContextPrecisionWithReference(),
    LLMContextRecall(),
    ContextEntityRecall(),
    Faithfulness(),
    ResponseRelevancy(),
]
eval_result_active = evaluate_ragas(
    dataset=evaluation_active_dataset,
    metrics=ragas_metrics,
    llm=evaluator_llm,
    run_config=custom_run_config,
)

# Keep the raw RAGAS result of this run under the name initialised earlier in
# this cell.
eval_full_results = eval_result_active

# Per-sample scores as a DataFrame, then mean and standard deviation for every
# numeric (metric) column.
df_eval_result_active = eval_result_active.to_pandas()
eval_result_active_means = df_eval_result_active.mean(numeric_only=True)
eval_result_active_stds = df_eval_result_active.std(numeric_only=True)

# Flat summary for this single pipeline: {"means": Series, "stds": Series}.
eval_summary_results = {
    "means": eval_result_active_means,
    "stds": eval_result_active_stds,
}

print("RAGAS Results for pipeline with baseline retrieval:")
print(eval_result_active_means)
print(eval_result_active_stds)

# Persist the raw and summary results so a later error or crash does not lose
# the evaluation run.
pickle_dict_active_ragas_results = dict(
    raw_results=eval_result_active,
    summary_results=eval_summary_results,
)

# Write the bundle to disk.
with open("cert_eval_ragas_results_for_baseline_retrieval.pkl", "wb") as results_file:
    pickle.dump(pickle_dict_active_ragas_results, results_file)


# Visualizations

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create heatmap visualization
# Column label paired with the summary keys it may be stored under. The second
# key for response relevancy covers RAGAS versions that report the
# ResponseRelevancy metric as "answer_relevancy" (assumption — confirm against
# the installed RAGAS version).
_HEATMAP_METRICS = (
    ("Context Precision", ("llm_context_precision_with_reference",)),
    ("Context Recall", ("context_recall",)),
    ("Entity Recall", ("context_entity_recall",)),
    ("Faithfulness", ("faithfulness",)),
    ("Response Relevancy", ("response_relevancy", "answer_relevancy")),
)


def _per_chain_summaries(eval_summary_results, default_label):
    """Return summaries keyed by chain name, wrapping a single flat summary."""
    is_flat = (
        "means" in eval_summary_results
        and "means" not in eval_summary_results["means"]
    )
    if is_flat:
        return {default_label: eval_summary_results}
    return eval_summary_results


def _mean_for(means, aliases):
    """Return the first mean found under one of ``aliases``, or NaN if none is."""
    for key in aliases:
        if key in means:
            return means[key]
    return float("nan")


def create_heatmap_visualization(eval_summary_results, default_label="baseline_retrieval"):
    """
    Create a heatmap showing metric performance across chains.

    Args:
        eval_summary_results: Either a mapping of chain name to a summary
            ``{"means": ..., "stds": ...}``, or one such summary on its own
            (the shape produced for a single pipeline).
        default_label: Row label used when a single flat summary is passed.

    Returns:
        The DataFrame that was plotted: one row per chain, one column per
        metric. Metrics missing from a summary appear as NaN.

    Raises:
        ValueError: If there is no summary to plot.
    """
    summaries = _per_chain_summaries(eval_summary_results, default_label)
    if not summaries:
        raise ValueError("No evaluation summaries to plot")

    # One row of metric means per chain, in the fixed column order above.
    heatmap_rows = [
        [_mean_for(summary["means"], aliases) for _, aliases in _HEATMAP_METRICS]
        for summary in summaries.values()
    ]
    df_heatmap = pd.DataFrame(
        heatmap_rows,
        index=[str(chain).replace('_', ' ').title() for chain in summaries],
        columns=[label for label, _ in _HEATMAP_METRICS],
    )

    # Draw the annotated heatmap.
    plt.figure(figsize=(10, 6))
    sns.heatmap(
        df_heatmap,
        annot=True,
        cmap='RdYlGn',
        fmt='.3f',
        linewidths=0.5,
        cbar_kws={'label': 'Score'},
        square=True,
    )

    # RdYlGn runs from red (low) to green (high), so the legend hint refers to
    # green rather than to darkness.
    plt.title('RAG Evaluation Metrics Heatmap\n(Greener = Better Performance)',
              fontsize=14, fontweight='bold', pad=20)
    plt.xlabel('Metrics', fontsize=12, fontweight='bold')
    plt.ylabel('Retrieval Strategies', fontsize=12, fontweight='bold')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    return df_heatmap

# Generate the heatmap. The evaluation cell stores one pipeline's summary as a
# flat {"means", "stds"} dict, whereas the heatmap is organised per chain, so a
# flat summary is wrapped under a chain label before plotting.
print("\nCreating heatmap visualization...")
summaries_by_chain = (
    {"baseline_retrieval": eval_summary_results}
    if "means" in eval_summary_results
    else eval_summary_results
)
heatmap_df = create_heatmap_visualization(summaries_by_chain)