In [3]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
# from ragas.langchain.evalchain import RagasEvaluatorChain
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall

# Prompt for multi-query retrieval: it asks the model for five rephrasings of
# the user's question, separated by newlines. `{question}` is the only
# placeholder. Everything between the triple quotes (line breaks included) is
# part of the prompt text itself, so edit it deliberately.
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
# Wrap the raw string as a chat prompt template; it is the first stage of the
# query-generation chain.
prompt_perspectives = ChatPromptTemplate.from_template(template)


In [None]:

# Chain to generate multiple queries
def _split_queries(text):
    """Split the model's reply into one query per non-blank line.

    Models often put blank lines between the alternatives (or end with a
    trailing newline). A plain ``split("\\n")`` would turn those into empty
    queries that are then sent to the retriever, so blank lines are dropped
    and surrounding whitespace is trimmed.
    """
    return [line.strip() for line in text.splitlines() if line.strip()]


# prompt -> chat model -> plain string -> list of query strings
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)

# Retrieve documents and get unique union
from langchain.load import dumps, loads
def get_unique_union(documents: list[list]) -> list:
    """Return the de-duplicated union of several lists of retrieved docs.

    Args:
        documents: One list of documents per query.

    Returns:
        A flat list with each distinct document once, in the order it was
        first seen. Two documents count as the same when their ``dumps``
        serializations are equal.
    """
    # Documents are serialized so they can be compared and used as keys.
    serialized = (dumps(doc) for hits in documents for doc in hits)
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # make the order depend on string hashing, so the context passed to the
    # prompt could differ from one run to the next.
    return [loads(blob) for blob in dict.fromkeys(serialized)]

# Sample question used to exercise the retrieval pipeline once.
question = "What is task decomposition for LLM agents?"

# Pipeline: rephrase the question, run the retriever on every rephrasing,
# then merge the per-query hits with get_unique_union.
retrieval_chain = (
    generate_queries
    | retriever.map()
    | get_unique_union
)
docs = retrieval_chain.invoke(dict(question=question))

# Create a RAG chain
# itemgetter is needed by the chain below but was never imported, which made
# this cell fail with a NameError.
from operator import itemgetter

# Answer prompt; `{context}` receives the retrieved documents and `{question}`
# the user's question. Note that this rebinds `template`.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(temperature=0)

# The input mapping {"question": ...} is fanned out: the whole mapping goes to
# retrieval_chain to produce the context, while itemgetter passes the question
# text through unchanged. The filled prompt is sent to the model and the reply
# is reduced to a plain string.
final_rag_chain = (
    {"context": retrieval_chain, "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

# Example test cases, written as (question, reference answer) pairs and then
# expanded into the dict shape the evaluation loop reads.
_QUESTION_ANSWER_PAIRS = (
    (
        "What is task decomposition for LLM agents?",
        "Task decomposition for LLM agents involves breaking down a complex task into smaller, manageable sub-tasks that can be solved individually by the LLM.",
    ),
    (
        "How do LLM agents handle large tasks?",
        "LLM agents handle large tasks by dividing them into smaller components and addressing each part separately, often using task decomposition techniques.",
    ),
)

test_cases = [
    {"input": asked, "expected_output": reference}
    for asked, reference in _QUESTION_ANSWER_PAIRS
]

# Create evaluation chains
# RagasEvaluatorChain was used here although its import at the top of the file
# is commented out, so this cell failed with a NameError. It is imported
# locally instead.
# NOTE(review): this module path is the one shipped by ragas 0.0.x, which is
# presumably what this notebook targets - confirm against the installed version.
from ragas.langchain.evalchain import RagasEvaluatorChain

# One evaluator chain per metric.
faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)
answer_rel_chain = RagasEvaluatorChain(metric=answer_relevancy)
context_rel_chain = RagasEvaluatorChain(metric=context_precision)
context_recall_chain = RagasEvaluatorChain(metric=context_recall)

# Evaluate the generated responses for each test case
# Both chains are independent of the individual test case, so build them once
# instead of on every iteration.
retrieval_chain = generate_queries | retriever.map() | get_unique_union
answer_chain = prompt | llm | StrOutputParser()

evaluation_results = []
for test_case in test_cases:
    question = test_case["input"]
    expected_output = test_case["expected_output"]

    # Generate multiple queries and retrieve documents
    docs = retrieval_chain.invoke({"question": question})

    # Answer from the documents just retrieved. Calling final_rag_chain here
    # would run retrieval a second time, and the contexts being scored would
    # not be guaranteed to match the ones the answer was written from.
    answer = answer_chain.invoke({"context": docs, "question": question})

    # The evaluator chains take a mapping (question, answer, retrieved
    # documents, reference answers), not the bare answer string.
    ragas_input = {
        "query": question,
        "result": answer,
        "source_documents": docs,
        "ground_truths": [expected_output],
    }

    # Evaluate the answer
    eval_result = {
        "input": question,
        "generated_answer": answer,
        "expected_answer": expected_output,
        "faithfulness_score": faithfulness_chain(ragas_input)["faithfulness_score"],
        "answer_relevancy_score": answer_rel_chain(ragas_input)["answer_relevancy_score"],
        "context_precision_score": context_rel_chain(ragas_input)["context_precision_score"],
        "context_recall_score": context_recall_chain(ragas_input)["context_recall_score"],
    }
    evaluation_results.append(eval_result)

# Calculate total scores and rank the responses
def calculate_total_score(eval_result, weights=None):
    """Return the weighted sum of the metric scores in *eval_result*.

    Args:
        eval_result: Mapping that contains a numeric value for every metric
            named in *weights*. Other keys are ignored.
        weights: Optional mapping of metric key to weight. When omitted, the
            four Ragas scores (faithfulness, answer relevancy, context
            precision, context recall) are weighted equally at 0.25 each.

    Returns:
        The sum of ``eval_result[metric] * weight`` over all weighted metrics.

    Raises:
        KeyError: If *eval_result* lacks a metric named in *weights*.
    """
    if weights is None:
        weights = {
            "faithfulness_score": 0.25,
            "answer_relevancy_score": 0.25,
            "context_precision_score": 0.25,
            "context_recall_score": 0.25,
        }
    return sum(eval_result[metric] * weight for metric, weight in weights.items())

# Attach the aggregate score to every evaluation record, in place.
for scored in evaluation_results:
    scored["total_score"] = calculate_total_score(scored)

# Highest total first.
ranked_responses = sorted(
    evaluation_results,
    key=lambda item: item["total_score"],
    reverse=True,
)

# Print a report per response: rank, question, both answers, then every
# field whose key ends in "_score".
for position, entry in enumerate(ranked_responses, start=1):
    score_fields = {
        name: value for name, value in entry.items() if name.endswith('_score')
    }
    print(
        f"Rank {position}:",
        f"Input: {entry['input']}",
        f"Generated Answer: {entry['generated_answer']}",
        f"Expected Answer: {entry['expected_answer']}",
        f"Total Score: {entry['total_score']}",
        f"Scores: {score_fields}",
        "\n",
        sep="\n",
    )
