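# QA Eval

This notebook generates answers to a few example questions with an `LLMChain`, then uses `QAEvalChatChain` (backed by a chat model) to grade each predicted answer against the reference answer.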

In [1]:
from langchain.evaluation.qa.chat_eval_chain import QAEvalChatChain
from langchain.chat_models import ChatOpenAI

# Chat model handed to the evaluation chain below, i.e. the model that does the grading.
# temperature=0 is presumably chosen to keep grades as repeatable as possible -- confirm.
model = ChatOpenAI(temperature=0)

# Evaluation chain built from the chat model; it is invoked later via
# eval_chain.evaluate(...) to grade the predictions.
eval_chain = QAEvalChatChain.from_model(model)

In [4]:
# Reference data as (question, expected answer) pairs; expanded below into the
# list of {"question": ..., "answer": ...} dicts that the rest of the notebook uses.
_qa_pairs = (
    (
        "Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?",
        "11",
    ),
    (
        'Is the following sentence plausible? "Joao Moutinho caught the screen pass in the NFC championship."',
        "No",
    ),
)

examples = [
    {"question": question_text, "answer": answer_text}
    for question_text, answer_text in _qa_pairs
]

In [5]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI

In [6]:
# Prompt with a single {question} slot, filled from each example's "question" field.
prompt = PromptTemplate(
    input_variables=["question"],
    template="Question: {question}\nAnswer:",
)

# Completion model that produces the answers to be graded.
llm = OpenAI(
    temperature=0,
    model_name="text-davinci-003",
)

# Chain that renders the prompt and sends it to the completion model.
chain = LLMChain(
    prompt=prompt,
    llm=llm,
)

In [7]:
# Run the chain over every example. The results are indexed positionally further down
# (predictions[i] alongside examples[i]) and each one is read through its "text" key.
predictions = chain.apply(examples)

In [8]:
# Grade each prediction against its example. question_key names the field of the
# example dicts holding the question; prediction_key names the field of the
# prediction dicts holding the generated answer.
graded_outputs = eval_chain.evaluate(
    examples,
    predictions,
    question_key="question",
    prediction_key="text",
)

In [9]:
# Print a side-by-side report for each example: the question, the reference answer,
# the chain's predicted answer, and the grade produced by the evaluation chain.
# examples, predictions and graded_outputs are all indexed with the same position i,
# so the three lists are expected to be the same length and in the same order.
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + eg['question'])
    print("Real Answer: " + eg['answer'])
    print("Predicted Answer: " + predictions[i]['text'])
    print("Predicted Grade: " + graded_outputs[i]['text'])
    # Blank line to separate consecutive examples in the output.
    print()

Example 0:
Question: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
Real Answer: 11
Predicted Answer:  11 tennis balls
Predicted Grade: GRADE: CORRECT

Example 1:
Question: Is the following sentence plausible? "Joao Moutinho caught the screen pass in the NFC championship."
Real Answer: No
Predicted Answer:  No, this sentence is not plausible. Joao Moutinho is a professional soccer player, not an American football player, so it is not likely that he would be catching a screen pass in the NFC championship.
Predicted Grade: GRADE: CORRECT

