# Evaluations

This notebook shows how to pull traces from a running Phoenix instance, evaluate them using the `arize-phoenix-evals` library, log the evaluations back to Phoenix, and export the spans to Arize.

In [1]:
%pip install "arize-phoenix[evals]" openai nest_asyncio arize-phoenix

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Run async evaluation in the notebook
import getpass
import os

import nest_asyncio

nest_asyncio.apply()

# Do not paste the key into the notebook: take it from the OPENAI_API_KEY
# environment variable and only prompt (without echo) when it is unset or empty.
OPEN_AI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [3]:
import phoenix as px

# Address of the running Phoenix instance this notebook reads from and writes to.
PHOENIX_ENDPOINT = "http://localhost:6006"

client = px.Client(endpoint=PHOENIX_ENDPOINT)

SyntaxError: invalid syntax (__init__.py, line 56)

In [None]:
from datetime import datetime, timedelta

from phoenix.trace.dsl.helpers import (
    get_qa_with_reference,
    get_retrieved_documents,
)

# Query Phoenix through `client`: `qa_df` feeds the response evaluations and
# `documents_df` feeds the retrieval evaluation further down.
qa_df = get_qa_with_reference(client)
documents_df = get_retrieved_documents(client)

In [None]:
import pandas as pd

# Widen text columns so long cell values are not truncated when displayed.
pd.set_option("display.max_colwidth", 800)

qa_df.head(5)

In [None]:
# Preview the first rows of the retrieved-documents dataframe.
documents_df.head(5)

In [None]:
## Evaluate Retrieval

from phoenix.evals import OpenAIModel, RelevanceEvaluator, run_evals

relevance_model = OpenAIModel(model="gpt-4-turbo-preview", api_key=OPEN_AI_API_KEY)
relevance_evaluator = RelevanceEvaluator(relevance_model)

# Only one evaluator is passed, so the single result is taken from position 0.
retrieval_results = run_evals(
    dataframe=documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
    concurrency=20,
)
relevance_evals = retrieval_results[0]

In [None]:
# Preview the first rows of the document relevance evaluations.
relevance_evals.head(5)

In [None]:
## Evaluate Responses

from phoenix.evals import HallucinationEvaluator, OpenAIModel, QAEvaluator, run_evals

# Each evaluator gets its own model instance, built from the same settings.
response_model_settings = {"model": "gpt-4-turbo-preview", "api_key": OPEN_AI_API_KEY}
qa_evaluator = QAEvaluator(OpenAIModel(**response_model_settings))
hallucination_evaluator = HallucinationEvaluator(OpenAIModel(**response_model_settings))

# The results are unpacked in the same order as the evaluators are listed.
response_evaluators = [qa_evaluator, hallucination_evaluator]
qa_evals, hallucination_evals = run_evals(
    dataframe=qa_df,
    evaluators=response_evaluators,
    provide_explanation=True,
    concurrency=20,
)

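## Add a custom eval

Define a custom answer-relevance prompt template and use `llm_classify` to label each response as "relevant" or "irrelevant".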

In [None]:
# Prompt template for the custom answer-relevance eval.
# It has two placeholders, `{input}` (the query) and `{output}` (the answer), and asks
# for a label of "relevant" or "irrelevant".
# NOTE(review): the opening sentence mentions "a reference text" and "the provided list
# of statements", but the template has no placeholder for either - confirm whether the
# wording should be tightened or a reference placeholder added.
ANSWER_RELEVANCE_TEMPLATE = ''' In this task, you will be presented with a query, a reference text and an answer. The answer is
generated to the question based on the reference text. The answer may contain irrelevant information. For the provided list of statements, 
determine whether each statement is relevant to address the input. If one or more statements are not relevant to the query, please label the answer 
as "irrelevant". If all statements are relevant to the query, please label the answer as "relevant".

Here is an example where the answer is "relevant" because the answer includes a recommendation for a winery with white wines (Chardonnay):
    # Query: What's a good place to go wine tasting for white wines in Napa?
    # Answer: Castle Winery has amazing Chardonnay.

Here is an example where the answer is "irrelevant" because the query is asking about white wines, but the answer recommends a winery based on its red wine (Cabernet):
    # Query: Where can I go wine tasting for white wines in Napa?
    # Answer: Stags Leap has great Cabernet.

Please provide your evaluation for the query and answer below:
    # Query: {input}
    # Answer: {output}

Is the answer above relevant or irrelevant to the above query?'''

In [None]:
# Classifier entry point used for the custom eval.
from phoenix.evals import llm_classify

In [None]:
# Labels the classifier is allowed to return.
answer_relevance_rails = ["relevant", "irrelevant"]
answer_relevance_model = OpenAIModel(model="gpt-4-turbo-preview", api_key=OPEN_AI_API_KEY)

custom_qa_relevance_classifications = llm_classify(
    dataframe=qa_df,
    template=ANSWER_RELEVANCE_TEMPLATE,
    model=answer_relevance_model,
    rails=answer_relevance_rails,
    # Optional: also generate an explanation for each label the eval LLM produces.
    provide_explanation=True,
)

In [None]:
# Preview the first rows of the custom answer-relevance classifications.
custom_qa_relevance_classifications.head(5)

In [None]:
from phoenix.trace import DocumentEvaluations, SpanEvaluations

# Span-level results keyed by the eval name they are logged under.
span_level_results = {
    "answer_relevance": custom_qa_relevance_classifications,
    "qa": qa_evals,
    "hallucination": hallucination_evals,
}

# Log the evaluations back to Phoenix: the document-level relevance results first,
# followed by the span-level results in the order declared above.
client.log_evaluations(
    DocumentEvaluations(dataframe=relevance_evals, eval_name="document_relevance"),
    *(
        SpanEvaluations(dataframe=frame, eval_name=label)
        for label, frame in span_level_results.items()
    ),
)

In [None]:
# Reuse the client configured earlier so the spans come from the same Phoenix
# endpoint the evaluations were read from and logged to.
spans_df = client.get_spans_dataframe()

In [None]:
import getpass
import os

from arize.pandas.logger import Client

# Credentials must not be stored in the notebook: read them from the environment and
# only prompt (without echo) when a variable is unset or empty.
SPACE_KEY = os.environ.get("ARIZE_SPACE_KEY") or getpass.getpass("Arize space key: ")
API_KEY = os.environ.get("ARIZE_API_KEY") or getpass.getpass("Arize API key: ")

# Reject empty values as well as the tutorial placeholders before contacting Arize.
if SPACE_KEY in ("", "SPACE_KEY") or API_KEY in ("", "API_KEY"):
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)
# Report success only once the client has actually been constructed.
print("✅ Import and Setup Arize Client Done! Now we can start using Arize!")

model_id = "generative-spans-tutorial-test"  # the model name in Arize
model_version = "1.0"  # (optional) the model version

response = arize_client.log_spans(
    dataframe=spans_df,
    model_id=model_id,
    model_version=model_version,  # optional
)

# If successful, the server will return a status_code of 200
if response.status_code != 200:
    print(f"❌ logging failed with response code {response.status_code}, {response.text}")
else:
    print("✅ You have successfully logged the spans to Arize")