# Evaluation
- Create QnA application
- Coming up with test data points
    - Hard-coded examples
    - LLM-Generated examples
    - Combine examples
- Evaluation
    - Manual Evaluation
    - LLM-assisted evaluation

---

## Setup

In [None]:
import openai
import os
from dotenv import load_dotenv, find_dotenv

# Read the .env file located by find_dotenv() so its variables are available
# through os.environ.
_ = load_dotenv(find_dotenv())

# Copy each OpenAI client setting from its environment variable onto the
# openai module; a variable that is not set yields None.
_OPENAI_SETTINGS = (
    ("api_type", "OPENAI_API_TYPE"),
    ("api_base", "OPENAI_API_BASE"),
    ("api_key", "OPENAI_API_KEY"),
    ("api_version", "OPENAI_API_VERSION"),
)
for _attr, _env_var in _OPENAI_SETTINGS:
    setattr(openai, _attr, os.environ.get(_env_var))

## Create QnA application

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [None]:
# Load the product catalog CSV. `data` is indexed and sliced further down
# (data[10], data[:5], ...), so it is an indexable sequence of documents —
# presumably one per CSV row; confirm against the CSVLoader documentation.
# NOTE(review): the path starts with ".../" (three dots). That may be a typo
# for "../" — confirm against the repository layout before running.
file = ".../data/OutdoorClothingCatalog_1000.csv"
loader = CSVLoader(file_path=file)
data = loader.load()

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Configure the index builder: an in-memory DocArray vector store with
# HuggingFace embeddings.
_index_creator = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=HuggingFaceEmbeddings(),
)

# Build the index from the catalog loader.
index = _index_creator.from_loaders([loader])

In [None]:
# Chat model used both to answer questions and, later in the notebook, to
# generate and grade examples. temperature=0 is passed to the model.
llm = AzureChatOpenAI(deployment_name="gpt4", temperature=0)

# Retrieval-backed QA chain over the catalog index, using the "stuff" chain
# type; "document_separator" is forwarded through chain_type_kwargs.
_catalog_retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=_catalog_retriever,
    chain_type="stuff",
    chain_type_kwargs={"document_separator": "<<<<>>>>>"},
    verbose=True,
)

## Coming up with test data points

In [None]:
# Display the loaded document at index 10 — presumably to pick facts for a
# hand-written test question.
data[10]

In [None]:
# Display the loaded document at index 11 — presumably to pick facts for a
# hand-written test question.
data[11]

### Hard-coded examples

In [None]:
# Hand-written question/answer pairs used as ground truth for evaluation.
# Each entry has a "query" (the question sent to the QA chain) and the
# expected "answer".
#
# Long queries are wrapped with adjacent string literals rather than a
# backslash continuation inside the literal: the latter embeds the next
# line's leading indentation in the query text.
examples = [
    {
        "query": "Do the Cozy Comfort Pullover Set have side pockets?",
        "answer": "Yes",
    },
    {
        "query": (
            "What collection is the Ultra-Lofty "
            "850 Stretch Down Hooded Jacket from?"
        ),
        "answer": "The DownTek collection",
    },
]

### LLM-Generated examples

In [None]:
from langchain.evaluation.qa import QAGenerateChain

# Chain that asks the LLM to write a question/answer pair for a document.
example_gen_chain = QAGenerateChain.from_llm(llm)

# Generate examples from the first five catalog documents only; each input is
# wrapped as {"doc": <document>}.
_generation_inputs = [{"doc": document} for document in data[:5]]
new_examples = example_gen_chain.apply_and_parse(_generation_inputs)

In [None]:
# Display the first LLM-generated example.
new_examples[0]

In [None]:
# Display the first loaded document; it is the first of the five documents
# (data[:5]) passed to the example generator.
data[0]

### Combine examples

In [None]:
# Append the LLM-generated examples to the hand-written ones.
# NOTE: `+=` extends the list in place, so re-running this cell appends the
# generated examples again.
examples += new_examples

In [None]:
# Run the QA chain on the first example's query.
qa.run(examples[0]["query"])

## Evaluation

### Manual Evaluation

In [None]:
import langchain

# Switch on LangChain's module-level debug flag before re-running the query;
# expected to make chain runs print their intermediate steps — see the
# LangChain debugging docs.
langchain.debug = True

In [None]:
# Run the QA chain on the first example's query, for manual inspection.
qa.run(examples[0]["query"])

In [None]:
# Turn debug mode back off so the remaining cells run without debug tracing.
langchain.debug = False

### LLM-assisted evaluation

In [None]:
# Run the QA chain over every example. Each prediction is later read by its
# "query", "answer" and "result" keys when the evaluation is printed.
predictions = qa.apply(examples)

In [None]:
from langchain.evaluation.qa import QAEvalChain

# Build an LLM-backed grader and have it grade the predictions against the
# examples. Each graded output is later read by its "text" key.
eval_chain = QAEvalChain.from_llm(llm)
graded_outputs = eval_chain.evaluate(examples, predictions)

In [None]:
# Print the details of the evaluation: for each example, the question, the
# reference answer, the chain's predicted answer, and the grade assigned by
# the eval chain.
#
# predictions and graded_outputs are walked in lockstep; both are expected to
# hold one entry per example, in the same order. f-strings are used so a
# non-string value is formatted rather than raising TypeError.
for i, (prediction, graded) in enumerate(zip(predictions, graded_outputs)):
    print(f"Example {i}:")
    print(f"Question: {prediction['query']}")
    print(f"Real Answer: {prediction['answer']}")
    print(f"Predicted Answer: {prediction['result']}")
    print(f"Predicted Grade: {graded['text']}")
    print()