# Wikipedia Roman Empire with LangChain

In [1]:
from getpass import getpass
import os

# Make sure both API keys are present in the process environment.
# Assigning `os.getenv(NAME)` straight back into `os.environ` raises
# `TypeError` when NAME is unset (environment values must be `str`), so fall
# back to a hidden prompt instead of crashing on a fresh machine.
for _secret_name in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY"):
    if not os.getenv(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

# Turn on LangChain tracing for every chain run in this kernel.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [2]:
from openai import OpenAI

# Module-level OpenAI client, keyed from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def call_openai(prompt):
    """Send `prompt` to the completions endpoint and return the generated text.

    The request uses the `gpt-3.5-turbo-instruct` model at temperature 0,
    capped at 128 output tokens, with both penalties set to 0. Only the text
    of the first returned choice is handed back.
    """
    request_options = dict(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        temperature=0,
        max_tokens=128,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = client.completions.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.text

# Phoenix

In [3]:
import phoenix as px
# Start the local Phoenix app and keep the returned session handle.
session = px.launch_app()

INFO:phoenix.config:📋 Ensuring phoenix working directory: /Users/jaychung/.phoenix


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [4]:
# Open a view onto the Phoenix app for the currently active session.
px.active_session().view()

📺 Opening a view to the Phoenix app. The app is running at http://localhost:6006/


In [5]:
from phoenix.trace.langchain import LangChainInstrumentor

# Initialize your LangChain application
# This might vary on your use-case. An example Chain is shown below
import bs4
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_community.retrievers import WikipediaRetriever

# Initialize Langchain auto-instrumentation
# (done before the first retriever call so that call is recorded as a span).
LangChainInstrumentor().instrument()

# Wikipedia-backed retriever used only to fetch the source pages.
# NOTE: the name `retriever` is rebound to the vector-store retriever in the
# next cell, so this object is not reachable after that cell runs.
retriever = WikipediaRetriever()

# Set context corpus and vector store
# Fetch the pages for "Roman Empire", cut them into 1000-character chunks
# with a 200-character overlap, and index the chunks in Chroma using OpenAI
# embeddings.
docs = retriever.invoke("Roman Empire")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

I0000 00:00:1737916754.329680 2735026 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1737916757.941713 2734713 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


In [6]:
# Chat model that generates the final answers in the RAG chain.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

# Retrieve and generate using the relevant chunks of the Wikipedia pages
# indexed in the vector store.
# This rebinds `retriever` from the WikipediaRetriever to the Chroma retriever.
retriever = vectorstore.as_retriever()
from langchain import hub
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate

# Stock RAG prompt pulled from the LangChain hub.
# NOTE(review): `prompt` is not referenced anywhere else in the visible
# notebook; the chain is built with `prompt_v1` instead.
prompt = hub.pull("rlm/rag-prompt")

# Custom single-message RAG prompt with two variables, `context` and `question`.
# The wording tells the model to use only the supplied excerpts and to return
# nothing when the answer is not known.
prompt_v1 = ChatPromptTemplate(
    input_variables=['context', 'question'],
    # Hub provenance fields; they name the `rlm/rag-prompt` repo and a commit hash.
    metadata={
        'lc_hub_owner': 'rlm',
        'lc_hub_repo': 'rag-prompt',
        'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'
    },
    messages=[
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=['context', 'question'],
                template="""
                You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
                Do not source information from other than the document excerpts provided. If you don't know the answer simply return nothing.
                Question: {question} 
                Context: {context} 
                Answer:"""
            )
        )
    ]
)

def format_docs(docs):
    """Join the `page_content` of every document into one string.

    Consecutive documents are separated by a blank line; an empty iterable
    yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# RAG pipeline. The incoming question is (a) sent to the retriever, whose
# documents are flattened to text by `format_docs` and supplied as "context",
# and (b) passed through unchanged as "question". Both fill `prompt_v1`, the
# result goes to the chat model, and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_v1
    | llm
    | StrOutputParser()
)

# Questions that are run through `rag_chain` one at a time to generate traces.
trivia_questions = [
    "Who was the second to last emperor of the roman empire?",
    "What year is traditionally considered the founding of Rome?",
    "Who was the first emperor of Rome?",
    "What was the Roman name for the Mediterranean Sea?",
    "Which Roman general famously crossed the Rubicon River in 49 BC?",
    "What structure was used by Romans for gladiatorial contests and public spectacles?",
    "What was the Roman legal code called that influenced many modern legal systems?",
    "Which Roman emperor initiated the persecution of Christians in the early 4th century AD?",
    "In Roman mythology, who was the god of war?",
    "What was the primary language spoken in ancient Rome?",
    "What was the name of the Roman Senate's advisory council of aristocratic families?",
    "Which Roman Emperor is known for dividing the Empire into the Eastern and Western Roman Empires?",
    "What was the Roman term for a large estate or farm?",
    "Which Roman structure, completed in 80 AD, could hold up to 50,000 spectators?",
    "What was the primary purpose of the Roman aqueducts?",
    "Who was the Roman poet known for writing the epic poem 'The Aeneid'?",
    "Which Roman emperor is famous for building a massive wall across northern Britain?",
    "What was the Roman currency called?",
    "What type of Roman building was used for public bathing?",
    "Who was the Roman goddess of wisdom and warfare?",
    "Which Roman general was known for his conquest of Gaul?"
]

# Run every trivia question through the RAG chain. Answers are not collected
# here; `response` simply holds the most recent answer after the loop ends.
for question in trivia_questions:
    response = rag_chain.invoke(question)

In [7]:
# Pull the spans recorded by Phoenix into a DataFrame and display it.
spans_df = px.Client().get_spans_dataframe()
spans_df

  df_attributes = pd.DataFrame.from_records(


Unnamed: 0_level_0,name,span_kind,parent_id,start_time,end_time,status_code,status_message,events,context.span_id,context.trace_id,...,attributes.input.mime_type,attributes.llm.prompt_template.variables,attributes.llm.prompt_template.template,attributes.llm.token_count.total,attributes.llm.token_count.completion,attributes.llm.output_messages,attributes.llm.input_messages,attributes.llm.model_name,attributes.llm.token_count.prompt,attributes.llm.invocation_parameters
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aa4ba5aef445785c,WikipediaRetriever,RETRIEVER,,2025-01-26 18:39:13.626509+00:00,2025-01-26 18:39:16.612048+00:00,OK,,[],aa4ba5aef445785c,40c7ea7560fe000612dbf5375abd2894,...,,,,,,,,,,
ba042a8a8a032d49,RunnablePassthrough,CHAIN,2458807d696bdf48,2025-01-26 18:39:20.444350+00:00,2025-01-26 18:39:20.444620+00:00,OK,,[],ba042a8a8a032d49,34b7e4f25fe292d7c0c4948a7c8be4b2,...,,,,,,,,,,
525e9ad3502c1d3f,VectorStoreRetriever,RETRIEVER,49c542514bec8b37,2025-01-26 18:39:20.445166+00:00,2025-01-26 18:39:20.977114+00:00,OK,,[],525e9ad3502c1d3f,34b7e4f25fe292d7c0c4948a7c8be4b2,...,,,,,,,,,,
77528b7ecec4201f,format_docs,CHAIN,49c542514bec8b37,2025-01-26 18:39:20.982938+00:00,2025-01-26 18:39:20.984094+00:00,OK,,[],77528b7ecec4201f,34b7e4f25fe292d7c0c4948a7c8be4b2,...,application/json,,,,,,,,,
49c542514bec8b37,RunnableSequence,CHAIN,2458807d696bdf48,2025-01-26 18:39:20.444918+00:00,2025-01-26 18:39:20.987749+00:00,OK,,[],49c542514bec8b37,34b7e4f25fe292d7c0c4948a7c8be4b2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25157c04571db771,"RunnableParallel<context,question>",CHAIN,3a62362d3dd5dce9,2025-01-26 18:39:39.795907+00:00,2025-01-26 18:39:40.255990+00:00,OK,,[],25157c04571db771,21c33629aec650c5d92f19ffc2007f2c,...,,,,,,,,,,
1b24320f18b047ef,ChatPromptTemplate,UNKNOWN,3a62362d3dd5dce9,2025-01-26 18:39:40.258154+00:00,2025-01-26 18:39:40.258691+00:00,OK,,[],1b24320f18b047ef,21c33629aec650c5d92f19ffc2007f2c,...,application/json,"{'context': 'By 100 BC, the city of Rome had e...",\n You are an assistant for que...,,,,,,,
63bb4656c3e24cdc,ChatOpenAI,LLM,3a62362d3dd5dce9,2025-01-26 18:39:40.260718+00:00,2025-01-26 18:39:40.584411+00:00,OK,,[],63bb4656c3e24cdc,21c33629aec650c5d92f19ffc2007f2c,...,application/json,,,763.0,4.0,"[{'message.content': 'Julius Caesar', 'message...",[{'message.content': '  You are...,gpt-3.5-turbo-0125,759.0,"{""model"": ""gpt-3.5-turbo-0125"", ""model_name"": ..."
edaee3b7d922236f,StrOutputParser,UNKNOWN,3a62362d3dd5dce9,2025-01-26 18:39:40.586986+00:00,2025-01-26 18:39:40.587325+00:00,OK,,[],edaee3b7d922236f,21c33629aec650c5d92f19ffc2007f2c,...,application/json,,,,,,,,,


In [8]:
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents

# One row per retrieved document, indexed by (context.span_id,
# document_position), with the query in `input` and the document text in
# `reference`.
retrieved_documents_df = get_retrieved_documents(px.active_session())
# One row per span, with `input`, `output` and `reference` columns.
queries_df = get_qa_with_reference(px.active_session())

In [9]:
retrieved_documents_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aa4ba5aef445785c,0,40c7ea7560fe000612dbf5375abd2894,Roman Empire,The Roman Empire ruled the Mediterranean and m...
aa4ba5aef445785c,1,40c7ea7560fe000612dbf5375abd2894,Roman Empire,"The fall of the Western Roman Empire, also cal..."
aa4ba5aef445785c,2,40c7ea7560fe000612dbf5375abd2894,Roman Empire,"The Holy Roman Empire, also known as the Holy ..."
525e9ad3502c1d3f,0,34b7e4f25fe292d7c0c4948a7c8be4b2,Who was the second to last emperor of the roma...,"By 476, the position of Western Roman Emperor ..."
525e9ad3502c1d3f,1,34b7e4f25fe292d7c0c4948a7c8be4b2,Who was the second to last emperor of the roma...,The first two centuries of the Empire saw a pe...


In [10]:
queries_df.head()

Unnamed: 0_level_0,input,output,reference
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aa4ba5aef445785c,Roman Empire,"{""documents"": [""page_content='The Roman Empire...",The Roman Empire ruled the Mediterranean and m...
01ca49ff560fb979,Who was the second to last emperor of the roma...,Romulus Augustulus,"By 476, the position of Western Roman Emperor ..."
e584dbd812dc8913,What year is traditionally considered the foun...,The founding of Rome is traditionally consider...,"By 100 BC, the city of Rome had expanded its r..."
bd801a8c5cdeab4b,Who was the first emperor of Rome?,Augustus was the first emperor of Rome.,"By 100 BC, the city of Rome had expanded its r..."
d29d6c5826f1b1c2,What was the Roman name for the Mediterranean ...,"The Roman name for the Mediterranean Sea was ""...",The Roman Empire ruled the Mediterranean and m...


In [11]:
from phoenix.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator
from phoenix.evals import run_evals
import nest_asyncio
nest_asyncio.apply()

# Judge model shared by both evaluators.
api_key = os.environ["OPENAI_API_KEY"]
eval_model = OpenAIModel(model="gpt-4-turbo-preview", api_key=api_key)

# Define the evaluators
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_evaluator = QAEvaluator(eval_model)

# Run both evaluators over every row of `queries_df`. The result frames come
# back in the same order as `evaluators`, each with label / score / explanation.
# NOTE(review): `queries_df` also contains the corpus-building
# WikipediaRetriever span (input "Roman Empire"), so that row is judged as if
# it were a question/answer pair. Consider filtering it out first.
hallucination_eval_df, qa_eval_df = run_evals(
    dataframe=queries_df,
    evaluators=[hallucination_evaluator, qa_evaluator],
    provide_explanation=True
)

# Log the evaluations
from phoenix.trace import SpanEvaluations

# Send both result sets back to Phoenix under the given evaluation names.
px.Client().log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_eval_df)
)

I0000 00:00:1737916785.643314 2734713 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


run_evals |          | 0/44 (0.0%) | ⏳ 00:00<? | ?it/s

I0000 00:00:1737916786.008407 2734713 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


In [12]:
hallucination_eval_df.head()

Unnamed: 0_level_0,label,score,explanation
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aa4ba5aef445785c,factual,0,The provided answer is a direct extraction fro...
01ca49ff560fb979,hallucinated,1,The query asks for the identity of the second ...
e584dbd812dc8913,factual,0,The reference text explicitly mentions that Ro...
bd801a8c5cdeab4b,factual,0,The reference text explicitly states that in 2...
d29d6c5826f1b1c2,hallucinated,1,The reference text provided does not mention t...


In [13]:
qa_eval_df.head()

Unnamed: 0_level_0,label,score,explanation
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aa4ba5aef445785c,correct,1,"The question provided is ""Roman Empire,"" which..."
01ca49ff560fb979,incorrect,0,The question asks for the identity of the seco...
e584dbd812dc8913,correct,1,The reference text explicitly states that Rome...
bd801a8c5cdeab4b,correct,1,"To determine if the answer is correct, we anal..."
d29d6c5826f1b1c2,incorrect,0,The reference text provided does not mention t...


In [14]:
import pandas as pd
# Put each hallucination verdict next to the question, answer and reference
# it was computed from by joining on the span id.
joined_df = pd.merge(hallucination_eval_df, queries_df, on='context.span_id')

joined_df.head()

Unnamed: 0_level_0,label,score,explanation,input,output,reference
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aa4ba5aef445785c,factual,0,The provided answer is a direct extraction fro...,Roman Empire,"{""documents"": [""page_content='The Roman Empire...",The Roman Empire ruled the Mediterranean and m...
01ca49ff560fb979,hallucinated,1,The query asks for the identity of the second ...,Who was the second to last emperor of the roma...,Romulus Augustulus,"By 476, the position of Western Roman Emperor ..."
e584dbd812dc8913,factual,0,The reference text explicitly mentions that Ro...,What year is traditionally considered the foun...,The founding of Rome is traditionally consider...,"By 100 BC, the city of Rome had expanded its r..."
bd801a8c5cdeab4b,factual,0,The reference text explicitly states that in 2...,Who was the first emperor of Rome?,Augustus was the first emperor of Rome.,"By 100 BC, the city of Rome had expanded its r..."
d29d6c5826f1b1c2,hallucinated,1,The reference text provided does not mention t...,What was the Roman name for the Mediterranean ...,"The Roman name for the Mediterranean Sea was ""...",The Roman Empire ruled the Mediterranean and m...


In [15]:
# Same join as `joined_df`, but for the QA-correctness verdicts.
joined_df_2 =  pd.merge(qa_eval_df, queries_df, on='context.span_id')

In [16]:
joined_df_2.head()

Unnamed: 0_level_0,label,score,explanation,input,output,reference
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aa4ba5aef445785c,correct,1,"The question provided is ""Roman Empire,"" which...",Roman Empire,"{""documents"": [""page_content='The Roman Empire...",The Roman Empire ruled the Mediterranean and m...
01ca49ff560fb979,incorrect,0,The question asks for the identity of the seco...,Who was the second to last emperor of the roma...,Romulus Augustulus,"By 476, the position of Western Roman Emperor ..."
e584dbd812dc8913,correct,1,The reference text explicitly states that Rome...,What year is traditionally considered the foun...,The founding of Rome is traditionally consider...,"By 100 BC, the city of Rome had expanded its r..."
bd801a8c5cdeab4b,correct,1,"To determine if the answer is correct, we anal...",Who was the first emperor of Rome?,Augustus was the first emperor of Rome.,"By 100 BC, the city of Rome had expanded its r..."
d29d6c5826f1b1c2,incorrect,0,The reference text provided does not mention t...,What was the Roman name for the Mediterranean ...,"The Roman name for the Mediterranean Sea was ""...",The Roman Empire ruled the Mediterranean and m...


In [17]:
# Line up both verdicts per span. The two frames share column names, so
# pandas suffixes them: `*_x` comes from `qa_eval_df` (QA correctness) and
# `*_y` from `hallucination_eval_df`.
correctness_hallucination =  pd.merge(qa_eval_df, hallucination_eval_df, on='context.span_id')

In [18]:
correctness_hallucination.head()

Unnamed: 0_level_0,label_x,score_x,explanation_x,label_y,score_y,explanation_y
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aa4ba5aef445785c,correct,1,"The question provided is ""Roman Empire,"" which...",factual,0,The provided answer is a direct extraction fro...
01ca49ff560fb979,incorrect,0,The question asks for the identity of the seco...,hallucinated,1,The query asks for the identity of the second ...
e584dbd812dc8913,correct,1,The reference text explicitly states that Rome...,factual,0,The reference text explicitly mentions that Ro...
bd801a8c5cdeab4b,correct,1,"To determine if the answer is correct, we anal...",factual,0,The reference text explicitly states that in 2...
d29d6c5826f1b1c2,incorrect,0,The reference text provided does not mention t...,hallucinated,1,The reference text provided does not mention t...


In [19]:
# Spans where the two judges disagree in one direction: the hallucination
# judge (`label_y`) says "factual" but the QA judge (`label_x`) says "incorrect".
correctness_hallucination[(correctness_hallucination['label_y'] == 'factual') & (correctness_hallucination['label_x'] == 'incorrect')]

Unnamed: 0_level_0,label_x,score_x,explanation_x,label_y,score_y,explanation_y
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


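- Open question: why don't the QA-correctness and hallucination eval DataFrames include the `input` and `output` columns? They carry only `label`, `score`, and `explanation` (indexed by `context.span_id`), so they have to be merged with `queries_df`, as done above, to see the question and answer next to each verdict.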

In [20]:
from phoenix.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

# Ask the judge model to classify each (query, retrieved document) row using
# the RAG relevancy template. The allowed labels are the values of the rails map.
retrieved_documents_eval = llm_classify(
    dataframe=retrieved_documents_df,
    model=OpenAIModel(model="gpt-4o", temperature=0.0),
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    rails=list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,
)

# Numeric score: 1 when the label is "relevant", 0 for any other label.
# Rows whose label is missing are left out of the right-hand side, so index
# alignment on assignment leaves their score as NaN.
retrieved_documents_eval["score"] = (
    retrieved_documents_eval.label[~retrieved_documents_eval.label.isna()] == "relevant"
).astype(int)

llm_classify |          | 0/87 (0.0%) | ⏳ 00:00<? | ?it/s

In [21]:
retrieved_documents_eval.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,label,explanation,exceptions,execution_status,execution_seconds,score
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
aa4ba5aef445785c,0,relevant,"The question is about the 'Roman Empire,' whic...",[],COMPLETED,2.381172,1
aa4ba5aef445785c,1,relevant,"The question is simply 'Roman Empire,' which i...",[],COMPLETED,3.717477,1
aa4ba5aef445785c,2,unrelated,"The question is about the 'Roman Empire,' whic...",[],COMPLETED,4.857805,0
525e9ad3502c1d3f,0,unrelated,The question asks for the identity of the seco...,[],COMPLETED,2.690012,0
525e9ad3502c1d3f,1,unrelated,To determine if the reference text is relevant...,[],COMPLETED,3.86702,0


In [22]:
from phoenix.trace import DocumentEvaluations

# Send the per-document relevance verdicts to Phoenix under the name "Relevance".
px.Client().log_evaluations(
    DocumentEvaluations(eval_name="Relevance", dataframe=retrieved_documents_eval)
)

In [23]:
queries_df.head()

Unnamed: 0_level_0,input,output,reference
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aa4ba5aef445785c,Roman Empire,"{""documents"": [""page_content='The Roman Empire...",The Roman Empire ruled the Mediterranean and m...
01ca49ff560fb979,Who was the second to last emperor of the roma...,Romulus Augustulus,"By 476, the position of Western Roman Emperor ..."
e584dbd812dc8913,What year is traditionally considered the foun...,The founding of Rome is traditionally consider...,"By 100 BC, the city of Rome had expanded its r..."
bd801a8c5cdeab4b,Who was the first emperor of Rome?,Augustus was the first emperor of Rome.,"By 100 BC, the city of Rome had expanded its r..."
d29d6c5826f1b1c2,What was the Roman name for the Mediterranean ...,"The Roman name for the Mediterranean Sea was ""...",The Roman Empire ruled the Mediterranean and m...


In [24]:
from openai import OpenAI
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for `text` from the OpenAI embeddings endpoint.

    Newlines in `text` are replaced with single spaces before the request is
    made. `model` selects the embedding model.
    """
    single_line_text = text.replace("\n", " ")
    embedding_response = client.embeddings.create(input=[single_line_text], model=model)
    return embedding_response.data[0].embedding

# Embed every query text (one API call per row) and store the vector on the frame.
# NOTE(review): the column is called `ada_embedding`, but `get_embedding`
# defaults to "text-embedding-3-small", so the name is misleading.
# This mutates `queries_df` in place; cells run after this one see the extra column.
queries_df['ada_embedding'] = queries_df["input"].apply(get_embedding)
queries_df.head()

Unnamed: 0_level_0,input,output,reference,ada_embedding
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aa4ba5aef445785c,Roman Empire,"{""documents"": [""page_content='The Roman Empire...",The Roman Empire ruled the Mediterranean and m...,"[-0.049663741141557693, 0.01780897006392479, 0..."
01ca49ff560fb979,Who was the second to last emperor of the roma...,Romulus Augustulus,"By 476, the position of Western Roman Emperor ...","[0.0034670475870370865, 0.018989551812410355, ..."
e584dbd812dc8913,What year is traditionally considered the foun...,The founding of Rome is traditionally consider...,"By 100 BC, the city of Rome had expanded its r...","[0.02312874048948288, -0.0018601700430735946, ..."
bd801a8c5cdeab4b,Who was the first emperor of Rome?,Augustus was the first emperor of Rome.,"By 100 BC, the city of Rome had expanded its r...","[0.012941939756274223, -0.03062393143773079, -..."
d29d6c5826f1b1c2,What was the Roman name for the Mediterranean ...,"The Roman name for the Mediterranean Sea was ""...",The Roman Empire ruled the Mediterranean and m...,"[0.000884795212186873, 0.019955161958932877, 0..."


# evaluation

In [25]:
# Same address the Phoenix app reported at launch.
# NOTE(review): presumably read by Phoenix clients/exporters created after
# this point. Confirm, and consider moving it to the configuration cell at the top.
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "http://localhost:6006"

# experiments - NBA toy dataset

In [26]:
# import phoenix as px
# import openai
# from textwrap import dedent

# # Create a list of dictionaries with examples
# inputs = [{"question": "Which team won the most games?"}, {"question": "Which team won the most games in 2015?"}, {"question": "Who led the league in 3 point shots?"}]
# outputs = [{"answer": "Team A"}, {"answer": "Team B"}, {"answer": "Player X"}]

# # Upload the dataset to Phoenix
# client = px.Client()
# dataset = client.upload_dataset(
#     inputs=inputs,
#     outputs=outputs,
#     dataset_name="nba-questions-v4"
# )

# openai.api_key = os.environ["OPENAI_API_KEY"]

# # Define the text2sql function using the new API interface
# def text2sql(question):
#     try:
#         response = openai.chat.completions.create(
#             model="gpt-3.5-turbo",
#             messages=[
#                 {"role": "system", "content": dedent("""
#                     You are a SQL expert, and you are given a table named nba_players with the following columns: player_name, three_point_shots. 
#                     Another table named nba_teams with columns: team_name, season, wins. 
#                     Write a SQL query corresponding to the user's request. Return just the query text, with no formatting (backticks, markdown, etc.).
#                 """)},
#                 {"role": "user", "content": question}
#             ]
#         )
#         query = response.choices[0].message.content.strip()
#         return {"results": query, "error": None}
#     except Exception as e:
#         return {"results": None, "error": str(e)}

# def task(x):
#     return text2sql(x["question"])

# def no_error(output) -> bool:
#     return output.get("error") is None

# def has_results(output) -> bool:
#     return bool(output.get("results"))

# from phoenix.trace.openai import OpenAIInstrumentor

# OpenAIInstrumentor().instrument()

# from phoenix.experiments import run_experiment

# run_experiment(dataset, task=task, evaluators=[no_error, has_results])


# experiment on Roman Empire RAG

In [28]:
import numpy as np
import phoenix as px
import openai
from textwrap import dedent

# dataset
# NOTE(review): this rebinds `client`, which earlier cells bound to the OpenAI
# client that `call_openai` and `get_embedding` read as a global. Those
# helpers will use this Phoenix client if called after this cell.
client = px.Client(endpoint="http://localhost:6006")
# NOTE(review): the dataset id is hard-coded. The recorded run of this cell
# ends in a 404 for this id, so the dataset must already exist on the Phoenix
# instance being used.
dataset = client.get_dataset(id="RGF0YXNldDox")

# Key used by the module-level `openai.*` calls in the task and evaluator below.
openai.api_key = os.environ["OPENAI_API_KEY"]

# define task
def ask_chat(question):
    """Ask `gpt-3.5-turbo` a question under a Roman-history system prompt.

    Returns a dict with two keys:
      - "results": the reply text with surrounding whitespace stripped, or
        None if anything went wrong;
      - "error": None on success, otherwise the exception message as a string.

    Any exception raised while calling the API or reading the reply is caught
    and reported through "error" rather than propagated.
    """
    try:
        completion = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": dedent("""
                    You are an expert on Roman History. If you don't know the answer, just say you don't know.
                """)},
                {"role": "user", "content": question},
            ],
        )
        answer = completion.choices[0].message.content.strip()
    except Exception as exc:
        return {"results": None, "error": str(exc)}
    else:
        return {"results": answer, "error": None}
    
def task(x):
    """Experiment task: answer the question stored under the example's "input" key."""
    question = x["input"]
    return ask_chat(question)

# define evaluators
def no_error(output) -> bool:
    """Evaluator: True when the task output's "error" entry is absent or None."""
    reported_error = output.get("error")
    return reported_error is None

def has_results(output) -> bool:
    """Evaluator: True when the task output has a "results" entry that is not None.

    An empty string still counts as a result; only None (or a missing key)
    is treated as no result.
    """
    answer = output.get("results")
    if answer is None:
        return False
    return True

def _cosine_similarity(vec_a, vec_b) -> float:
    """Cosine similarity of two vectors; 0.0 when either vector has zero length."""
    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Avoid a division by zero / NaN for degenerate embeddings.
        return 0.0
    return float(np.dot(a, b) / norm_product)


def correctness(output, expected) -> bool:
    """Evaluator: is the task's answer semantically close to the expected answer?

    Both texts are embedded with `text-embedding-ada-002` and compared by
    cosine similarity; the answer counts as correct at a similarity of 0.90
    or higher.

    Args:
        output: task output dict; the answer is read from its "results" key.
        expected: expected-output dict; the reference answer is read from its
            "output" key.

    Returns:
        A built-in `bool`. False is returned without calling the embeddings
        API when either text is missing or empty (for example when the task
        failed and reported `results=None`).
    """
    result = output.get("results")
    expected_output = expected.get("output")

    # Nothing to embed: a failed task (results is None) or a missing reference
    # cannot be correct, and would otherwise make the embeddings request fail.
    if not result or not expected_output:
        return False

    # Get embeddings for result and expected_output
    result_embedding_response = openai.embeddings.create(input=result, model="text-embedding-ada-002")
    expected_embedding_response = openai.embeddings.create(input=expected_output, model="text-embedding-ada-002")

    # Extract embeddings from the responses
    result_embedding = result_embedding_response.data[0].embedding
    expected_embedding = expected_embedding_response.data[0].embedding

    # Calculate cosine similarity
    similarity = _cosine_similarity(result_embedding, expected_embedding)

    # Define a threshold for semantic similarity
    threshold = 0.90
    # Cast explicitly so the declared `bool` return type holds; comparing
    # numpy values yields `numpy.bool_`, which is not a `bool` instance.
    return bool(similarity >= threshold)


from phoenix.trace.openai import OpenAIInstrumentor

# Instrument the OpenAI client library before the experiment starts calling it.
OpenAIInstrumentor().instrument()

from phoenix.experiments import run_experiment

# Run `task` over the dataset's examples and score each run with the three
# evaluators defined above. The returned handle is used later to fetch the
# evaluation results.
experiment = run_experiment(dataset, task=task, evaluators=[no_error, has_results, correctness])

HTTPStatusError: Client error '404 Not Found' for url 'http://localhost:6006/v1/datasets/RGF0YXNldDox/examples'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404

In [None]:
# Fetch the evaluator results recorded for each experiment run.
evaluations = experiment.get_evaluations()
# End the cell on the bare name so Jupyter renders the table with its rich
# display instead of the wrapped plain-text form that print() produces.
evaluations

                                 name  score label  \
run_id                                               
RXhwZXJpbWVudFJ1bjox         no_error    1.0  True   
RXhwZXJpbWVudFJ1bjox      has_results    1.0  True   
RXhwZXJpbWVudFJ1bjox      correctness    1.0  None   
RXhwZXJpbWVudFJ1bjoy         no_error    1.0  True   
RXhwZXJpbWVudFJ1bjoy      has_results    1.0  True   
RXhwZXJpbWVudFJ1bjoy      correctness    1.0  None   
RXhwZXJpbWVudFJ1bjoz         no_error    1.0  True   
RXhwZXJpbWVudFJ1bjoz      has_results    1.0  True   
RXhwZXJpbWVudFJ1bjoz      correctness    0.0  None   
RXhwZXJpbWVudFJ1bjo0         no_error    1.0  True   
RXhwZXJpbWVudFJ1bjo0      has_results    1.0  True   
RXhwZXJpbWVudFJ1bjo0      correctness    1.0  None   
RXhwZXJpbWVudFJ1bjo1         no_error    1.0  True   
RXhwZXJpbWVudFJ1bjo1      has_results    1.0  True   
RXhwZXJpbWVudFJ1bjo1      correctness    1.0  None   
RXhwZXJpbWVudFJ1bjo2         no_error    1.0  True   
RXhwZXJpbWVudFJ1bjo2      ha