In [8]:
%load_ext autoreload
%autoreload 2

In [1]:
from cli_evaluate import *
from cli_create_dataset import create_dataset

Test parameters:

In [19]:
vector_store = "example_vs"
# Path or None
# If a path is provided, params file_paths, chunk_chars, overlap, and split_method will be overridden by the vector 
# store. Param embeddings needs to be manually set to correspond with the vector store embeddings.

file_paths = ["docs/fee_brochure.pdf", "docs/fee_rules"]

dataset_file = "datasets/example.csv"
# Path or None

num_eval_questions = 10

chunk_chars = 1500

overlap = 0

split_method = "RecursiveTextSplitter"
# Available options:
    # "RecursiveTextSplitter"
    # "CharacterTextSplitter"
    
embeddings = "OpenAI"
# Available options:
    # "OpenAI"
    # "gte-large"
    # "FastText"
    # Hugging Face path, e.g. "thenlper/gte-large"
    
model_version = "gpt-3.5-turbo"
# Available options:
    # "gpt-3.5-turbo"
    # "gpt-4"
    # "llama2"
    
grade_prompt = "OpenAI grading prompt"
# Available options:
    # "Fast" - only Correct/Incorrect
    # "Descriptive w/ bias check"
    # "OpenAI grading prompt" - descriptive w/o bias check
    # "Basic" - Correctness + basic justification
        
num_neighbors = 3
# Number of retrieved source documents


params = {
    "file_paths": file_paths,
    "dataset_file": dataset_file,
    "num_eval_questions": num_eval_questions,
    "chunk_chars": chunk_chars,
    "overlap": overlap,
    "split_method": split_method,
    "embeddings": embeddings,
    "model_version": model_version,
    "grade_prompt": grade_prompt,
    "num_neighbors": num_neighbors,
}

Test logic:

Load or generate dataset:

In [3]:
if dataset_file is None:
    test_dataset = create_dataset(file_paths, num_eval_questions, 3000)
else:        
    test_dataset = pd.read_csv(dataset_file)[["question", "answer"]].to_dict('records')
    print(f"Loaded {len(test_dataset)} questions")

Loaded 10 questions


Save dataset:

In [65]:
dataset_df = pd.DataFrame(test_dataset)
dataset_df.to_csv("datasets/name.csv")

Load or create vector store:

In [4]:
embd = make_embeddings(embeddings)
if vector_store is not None:
    vs = FAISS.load_local(vector_store, embd)
    print(f"Loaded {vector_store}")
else:
    splits = []
    for path in file_paths:
        loader = PyMuPDFLoader(path) # Fast, Good for metadata
        if split_method == "RecursiveTextSplitter":
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_chars,
                                                           chunk_overlap=overlap)
        elif split_method == "CharacterTextSplitter":
            text_splitter = CharacterTextSplitter(separator=" ",
                                                  chunk_size=chunk_chars,
                                                  chunk_overlap=overlap)
        else:
            raise Exception("Invalid text splitter")
    
        local_pages = loader.load_and_split(text_splitter)
        splits.extend(local_pages)
    vs = FAISS.from_documents(splits, embd)
    print("New vector store created")

Loaded example_vs


Save vector store:

In [6]:
vs.save_local("vs_name")

Run test:

In [20]:
retriever = vs.as_retriever(k=num_neighbors)
results = run_evaluator(retriever, test_dataset, num_eval_questions, model_version, grade_prompt)
score = results["answerScore"].mean()
score_str = f"Total score: {score} ({len(results['answerScore'][results['answerScore'] == 1])} / {len(results['answerScore'])})"
print("\n")
print(score_str)
lat = results[results['latency'] < results['latency'].quantile(0.99)]['latency'].mean()
print(f"Avg. latency: {lat}")

Evaluating...
Question 10 / 10, Correct: 6 / 9, ACC: 0.6666666666666666

Total score: 0.6 (6 / 10)
Avg. latency: 1.6852789454989963


Show incorrect examples first (optional):

In [6]:
results = results.sort_values("answerScore")

Save results:

In [21]:
results[score_str] = ""
results[f"Parameters: {str(params)}"] = ""
results.loc[results.index[0], score_str] = f"Typical latency: {lat}"
results.to_csv("results/test.csv")