In [None]:
# Import necessary modules
from llama_index.core.evaluation import MultiModalRetrieverEvaluator
import re
import json
import tqdm
import pandas as pd
from llama_index.core.evaluation import EvaluationResult
from llama_index.core.evaluation.notebook_utils import get_eval_results_df

# Define retriever evaluators
clip_evaluator = MultiModalRetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=clip_retriever
)

text_desc_evaluator = MultiModalRetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=text_desc_retriever
)

# Create labeled datasets for evaluation
image_qa_dataset = create_labelled_retrieval_dataset(
    r"(?:([A-Z]+).jpg)", image_nodes, "image"
)

text_qa_dataset = create_labelled_retrieval_dataset(
    r"(?:To sign ([A-Z]+) in ASL:)", text_nodes, "text"
)

text_desc_qa_dataset = create_labelled_retrieval_dataset(
    r"(?:([A-Z]+).jpg)", image_with_text_nodes, "image"
)

# Evaluate the retrievers on the datasets
eval_results_image = await clip_evaluator.evaluate_dataset(image_qa_dataset)
eval_results_text = await clip_evaluator.evaluate_dataset(text_qa_dataset)
eval_results_text_desc = await text_desc_evaluator.evaluate_dataset(text_desc_qa_dataset)

# Convert evaluation results to DataFrame
results_df = get_retrieval_results_df(
    names=["image_retrieval", "text_retrieval", "text_desc_retrieval"],
    results_arr=[
        eval_results_image,
        eval_results_text,
        eval_results_text_desc,
    ],
)

# Load the human-written reference answers (a JSON mapping that the
# evaluation step later indexes by letter).
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open("asl_data/human_responses.json", encoding="utf-8") as json_file:
    human_answers = json.load(json_file)

# True  -> reuse the responses cached in asl_data/response_data.json.
# False -> query every RAG engine again and overwrite that cache.
load_previous_responses = True
# Number of leading entries of `response_data` that the evaluation step scores.
number_evals = 27

if not load_previous_responses:
    # NOTE(review): `asl_text_descriptions`, `QUERY_STR_TEMPLATE` and
    # `rag_engines` are defined outside this cell.
    response_data = []
    for letter in tqdm.tqdm(asl_text_descriptions.keys()):
        query = QUERY_STR_TEMPLATE.format(symbol=letter)

        # One record per engine: the generated answer plus the image paths
        # and text chunks the engine reported as its sources.
        responses = {}
        for name, engine in rag_engines.items():
            result = engine.query(query)
            responses[name] = {
                "response": result.response,
                "sources": {
                    "images": [
                        score_img_node.node.metadata["file_path"]
                        for score_img_node in result.metadata["image_nodes"]
                    ],
                    "texts": [
                        score_text_node.node.text
                        for score_text_node in result.metadata["text_nodes"]
                    ],
                },
            }

        response_data.append({"query": query, "responses": responses})

    # Save responses so later runs can skip the engine queries.
    with open(
        "asl_data/response_data.json", "w", encoding="utf-8"
    ) as json_file:
        json.dump(response_data, json_file)
else:
    # Load previous responses
    with open("asl_data/response_data.json", encoding="utf-8") as json_file:
        response_data = json.load(json_file)

# Judges used to score the generated responses, keyed by metric name.
# Correctness is judged by a text-only GPT-4 model at temperature 0; relevancy
# and faithfulness each get their own GPT-4 vision model instance capped at
# 300 new tokens.
judges = {
    "correctness": CorrectnessEvaluator(
        llm=OpenAI(temperature=0, model="gpt-4"),
    ),
    "relevancy": MultiModalRelevancyEvaluator(
        multi_modal_llm=OpenAIMultiModal(
            model="gpt-4-vision-preview",
            max_new_tokens=300,
        )
    ),
    "faithfulness": MultiModalFaithfulnessEvaluator(
        multi_modal_llm=OpenAIMultiModal(
            model="gpt-4-vision-preview",
            max_new_tokens=300,
        )
    ),
}

# Load or evaluate previous evaluations
load_previous_evaluations = True

if not load_previous_evaluations:
    evals = {
        "names": [],
        "correctness": [],
        "relevancy": [],
        "faithfulness": [],
    }

    # Loop through responses and evaluate them
    for data_entry in tqdm.tqdm(response_data[:number_evals]):
        reg_ex = r"(?:How can I sign a ([A-Z]+)?)"
        match = re.search(reg_ex, data_entry["query"])

        batch_names = []
        batch_correctness = []
        batch_relevancy = []
        batch_faithfulness = []
        if match:
            letter = match.group(1)
            reference_answer = human_answers[letter]
            for rag_name, rag_response_data in data_entry["responses"].items():
                correctness_result = await judges["correctness"].evaluate(
                    query=data_entry["query"],
                    response=rag_response_data["response"],
                    reference=reference_answer,
                )

                relevancy_result = judges["relevancy"].evaluate(
                    query=data_entry["query"],
                    response=rag_response_data["response"],
                    contexts=rag_response_data["sources"]["texts"],
                    image_paths=rag_response_data["sources"]["images"],
                )

                faithfulness_result = judges["faithfulness"].evaluate(
                    query=data_entry["query"],
                    response=rag_response_data["response"],
                    contexts=rag_response_data["sources"]["texts"],
                    image_paths=rag_response_data["sources"]["images"],
                )

                batch_names.append(rag_name)
                batch_correctness.append(correctness_result)
                batch_relevancy.append(relevancy_result)
                batch_faithfulness.append(faithfulness_result)

            evals["names"] += batch_names
            evals["correctness"] += batch_correctness
            evals["relevancy"] += batch_relevancy
            evals["faithfulness"] += batch_faithfulness

    # Save evaluations
    evaluations_objects = {
        "names": evals["names"],
        "correctness": [e.dict() for e in evals["correctness"]],
        "faithfulness": [e.dict() for e in evals["faithfulness"]],
        "relevancy": [e.dict() for e in evals["relevancy"]],
    }
    with open("asl_data/evaluations.json", "w") as json_file:
        json.dump(evaluations_objects, json_file)
else:
    # Load previous evaluations
    with open("asl_data/evaluations.json") as json_file:
        evaluations_objects = json.load(json_file)

    evals = {}
    evals["names"] = evaluations_objects["names"]
    evals["correctness"] = [
        EvaluationResult.parse_obj(e)
        for e in evaluations_objects["correctness"]
    ]
    evals["faithfulness"] = [
        EvaluationResult.parse_obj(e)
        for e in evaluations_objects["faithfulness"]
    ]
    evals["relevancy"] = [
        EvaluationResult.parse_obj(e) for e in evaluations_objects["relevancy"]
    ]

# Turn the judge verdicts into tables: one detailed table (taken from the
# correctness run) and one table of mean scores per metric.
eval_names = evals["names"]

deep_eval_df, mean_correctness_df = get_eval_results_df(
    eval_names, evals["correctness"], metric="correctness"
)
# Only the mean table is kept for relevancy and faithfulness.
_, mean_relevancy_df = get_eval_results_df(
    eval_names, evals["relevancy"], metric="relevancy"
)
_, mean_faithfulness_df = get_eval_results_df(
    eval_names, evals["faithfulness"], metric="faithfulness"
)

# Stack the three mean tables row-wise, then use the former index column
# (named "index" by reset_index) as the row index and label it "metrics".
mean_frames = [
    frame.reset_index()
    for frame in (
        mean_correctness_df,
        mean_relevancy_df,
        mean_faithfulness_df,
    )
]
mean_scores_df = (
    pd.concat(mean_frames, axis=0, ignore_index=True)
    .set_index("index")
    .rename_axis("metrics")
)

# Show the first four detailed rows, then display the summary table.
print(deep_eval_df[:4])
mean_scores_df
