# Prototyping Notebook

Prototyping for data loading, retrieval, and evaluation.



In [1]:

from dotenv import load_dotenv
import os
from uuid import uuid4

load_dotenv()

# Fail fast if any required API key is missing from the environment.
for required_key in (
    "OPENAI_API_KEY",
    "TAVILY_API_KEY",
    "LANGCHAIN_API_KEY",
    "COHERE_API_KEY",
):
    assert os.getenv(required_key), f"Missing {required_key}"

langsmith_api_key = os.getenv("LANGCHAIN_API_KEY")

# LangSmith tracing: enable it and point it at this notebook's project.
langsmith_project_name = "AIM-CERT-LANGGRAPH-BASELINE"
os.environ.update(
    LANGCHAIN_TRACING_V2="true",
    LANGCHAIN_PROJECT=langsmith_project_name,
)
print(langsmith_project_name)

MODE = "PIPE"

# Imported after the environment is configured, as in the original cell order.
from langgraph_agent import LangGraphAgent, RetrievalEnums

Agent = LangGraphAgent(
    retriever_mode=RetrievalEnums.NAIVE,
    MODE=MODE,
    langchain_project_name=langsmith_project_name,
)


AIM-CERT-LANGGRAPH-BASELINE


# Load Eval Golden Dataset

In [2]:
from pathlib import Path
import pickle

# Deserialize the golden evaluation dataset from the working directory.
# NOTE: pickle can execute arbitrary code on load - only open trusted files.
with open("eval_golden_dataset.pkl", mode="rb") as dataset_file:
    golden_dataset = pickle.load(dataset_file)

# Evaluations

In [None]:

import copy
import pickle
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from ragas import EvaluationDataset
from ragas import evaluate as evaluate_ragas, RunConfig
from ragas.metrics import (
    LLMContextPrecisionWithReference,
    LLMContextRecall,
    ContextEntityRecall,
    Faithfulness,
    ResponseRelevancy
    )
import numpy as np
import pickle
import prompts
import importlib
importlib.reload(prompts)

skip_evaluation = []

eval_full_results = {}
eval_summary_results = {}
eval_langsmith_raw_results = {}
eval_langsmith_summary_results = {}

# Prepare dataset for this chain
golden_dataset_active_copy = copy.deepcopy(golden_dataset)

count = 0
# Generate responses for evaluation
for test_row in golden_dataset_active_copy:

    print(f"user input: {test_row.eval_sample.user_input}")
    
    response = await Agent.chat(test_row.eval_sample.user_input + "Use your RAG tool or web search tool to get context to asnwer my question")
    
    Agent.reset_longer_term_memory()

    test_row.eval_sample.response = response["response"]

    count += 1
    for source in ["rag", "search"]:
        for context in response["context"].get(source, []):
            if hasattr(context, "page_content"):
                print(f"{count} - {source} - type: {type(context)}")
            else:
                print(f"{count} {source} - type: {type(context)}")
                     
    # Merge context from RAG and search
    test_row.eval_sample.retrieved_contexts = [
        context.page_content if hasattr(context, "page_content") else context
        for source in ["rag", "search"]
        for context in response["context"].get(source, [])
    ]

# Rebuild a RAGAS EvaluationDataset from the rows populated above.
evaluation_active_dataset = EvaluationDataset.from_pandas(golden_dataset_active_copy.to_pandas())
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))

# NOTE(review): timeout is presumably in seconds per evaluation call - confirm.
custom_run_config = RunConfig(timeout=360)

# Run evaluation with retrieval metrics (context precision / recall / entity
# recall) and generation metrics (faithfulness, response relevancy).
print("Running RAGAS evaluation...")
eval_result_active = evaluate_ragas(
    dataset=evaluation_active_dataset,
    metrics=[
        LLMContextPrecisionWithReference(),
        LLMContextRecall(),
        ContextEntityRecall(),
        Faithfulness(),
        ResponseRelevancy(),
    ],
    llm=evaluator_llm,
    run_config=custom_run_config,
)

# Keep the raw result under the intended name (the original assigned to a
# misspelled variable, leaving eval_full_results empty).
eval_full_results = eval_result_active

# Convert to DataFrames and compute statistics
df_eval_result_active = eval_result_active.to_pandas()
eval_result_active_means = df_eval_result_active.mean(numeric_only=True)
eval_result_active_stds = df_eval_result_active.std(numeric_only=True)

eval_summary_results = {
    "means": eval_result_active_means,
    "stds": eval_result_active_stds
}

print("RAGAS Results for pipeline with baseline retrieval:")
print(eval_result_active_means)
print(eval_result_active_stds)

# Persist the raw and summary results so a later error or crash does not
# lose the evaluation run.
pickle_agent_baseline_ragas_results = dict(
    raw_results=eval_result_active,
    summary_results=eval_summary_results,
)

# Write the bundle to disk
with open("cert_agent_eval_ragas_results_for_baseline_retrieval.pkl", mode="wb") as results_file:
    pickle.dump(pickle_agent_baseline_ragas_results, results_file)


user input: How can parents help their child develop the executive function skills of the prefrontal cortex?
response ready ...
response: {'response': 'Parents can help their child develop the executive function skills of the prefrontal cortex through several positive and supportive strategies:\n\n1. Model Respectful Communication: Parents should model a respectful tone and avoid yelling, as children learn how to express emotions and solve problems from their parents\' behavior. A calm and respectful approach fosters better influence and helps children develop self-regulation.\n\n2. Strengthen the Relationship: Focus on building a strong relationship where the child feels seen, heard, understood, and valued. This encourages the child to be honest and cooperative. Daily one-on-one time, active listening, and connecting through the child\'s interests can build this foundation.\n\n3. Use Reflective Questions: Instead of lectures, ask thoughtful and warm questions that encourage the child 

CancelledError: 

# Visualizations

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create heatmap visualization
def create_heatmap_visualization(eval_summary_results, label="Baseline Retrieval"):
    """
    Create a single-row heatmap of mean metric scores for one pipeline.

    Args:
        eval_summary_results: Mapping with a "means" entry (dict or pandas
            Series) keyed by metric name. Other entries such as "stds" are
            ignored.
        label: Row label shown on the y-axis for this pipeline.

    Returns:
        pandas.DataFrame with one row (``label``) and one column per metric.

    Raises:
        KeyError: If "means" is missing, or a metric has no score under any
            of the names it is looked up by.
    """
    # Display name -> metric keys to try, in order.
    metric_aliases = {
        'Context Precision': ('llm_context_precision_with_reference',),
        'Context Recall': ('context_recall',),
        'Entity Recall': ('context_entity_recall',),
        'Faithfulness': ('faithfulness',),
        # NOTE(review): ragas appears to report ResponseRelevancy under the
        # name "answer_relevancy" - confirm; both spellings are accepted.
        'Response Relevancy': ('response_relevancy', 'answer_relevancy'),
    }

    means = eval_summary_results["means"]

    row_data = []
    for display_name, candidates in metric_aliases.items():
        metric_key = next((key for key in candidates if key in means), None)
        if metric_key is None:
            raise KeyError(
                f"No score for {display_name!r}; tried {list(candidates)}, "
                f"available: {list(means.keys())}"
            )
        row_data.append(means[metric_key])

    # One row per pipeline. The original took the index from the summary's
    # keys ("means", "stds"), which is two labels for a single data row and
    # makes the DataFrame constructor raise ValueError.
    df_heatmap = pd.DataFrame(
        [row_data],
        index=[label],
        columns=list(metric_aliases),
    )

    # Create the heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(df_heatmap,
                annot=True,
                cmap='RdYlGn',
                fmt='.3f',
                linewidths=0.5,
                cbar_kws={'label': 'Score'},
                square=True)

    plt.title('RAG Evaluation Metrics Heatmap\n(Darker = Better Performance)',
              fontsize=14, fontweight='bold', pad=20)
    plt.xlabel('Metrics', fontsize=12, fontweight='bold')
    plt.ylabel('Retrieval Strategies', fontsize=12, fontweight='bold')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    return df_heatmap

# Render the heatmap from the evaluation summary (`eval_summary_results`)
status_message = "\nCreating heatmap visualization..."
print(status_message)
heatmap_df = create_heatmap_visualization(eval_summary_results=eval_summary_results)