# Chatbot Test Notebook

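This notebook tests the swimming-rules chatbot implemented in the `rag_swimrules` project. It runs a set of officiating situations through the retrieval-augmented pipeline, scores each answer against a reference answer with an LLM judge, and saves the results so that different vector stores and models can be compared.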

In [1]:
import textwrap
import json
import pandas as pd
import time
import random
import pickle

from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain_chroma.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

from openevals.prompts import CORRECTNESS_PROMPT
from openevals.llm import create_llm_as_judge


In [2]:
# How many situations to draw at random for a quick run; None evaluates all of them.
RANDOM_SAMPLE_SIZE = None

# Name of the vector store variant to query.
# Other values noted by the author: "_techrules", "_semantic".
VECTORDB = "semantic"

# Chat model that answers the questions.
# Other values noted by the author: "gpt-4o-mini", "gpt-4o".
RAG_LLM = "gpt-3.5-turbo"

In [3]:


# Prompt used for the answer-generation step. It has three placeholders:
#   {context}  - the rule text supplied for the question
#   {guidance} - the rule interpretation guidance text
#   {question} - the situation being asked about
# NOTE(review): guidelines 3 and 4 largely overlap and "explicity" is misspelled.
# They are left untouched here because any edit to this text changes what the
# model receives and therefore the evaluation results.
swimming_rules_template = """You are an expert swimming rules assistant trained on FINA and competitive swimming regulations. 
Your answers must be precise, factual, and based only on the provided rules. Follow these guidelines:

1. Answer ONLY using the verified rules and interpretation guidance provided below
2. DO NOT use information from any other sources, including the internet or personal knowledge.
3. if an action is not explicitly prohibited in the rules, assume it is allowed.
4. if an action is not explicity mentioned, assume it is allowed.
5. if an action is explicitly stated in the rule, follow what the rule says.
6. A disqualification must be based on a specific rule violation and cannot be implied.
7. If the information isn't in the rules, say "I don't have that rule in my current documentation"
8. Keep answers concise but complete
9. Reference specific rule numbers when possible.

Relevant swimming rules:
{context}

Rule interpretation guidance:
{guidance}

Question: {question}

Answer:"""

# Wrap the raw text in a PromptTemplate, declaring the three variables it expects.
SWIMMING_RULES_PROMPT = PromptTemplate(
    template=swimming_rules_template,
    input_variables=["context", "guidance", "question"]
)


In [4]:
# Read the evaluation situations from the JSON file.
# The encoding is passed explicitly: the situation text contains non-ASCII
# characters (e.g. curly quotes), and the platform default encoding is not
# UTF-8 everywhere, which would corrupt or fail to decode them.
with open("../raw_data/swimming_situations2.json", encoding="utf-8") as f:
    swimming_situations = json.load(f)

print(f"# of situations: {len(swimming_situations)}")

# of situations: 56


In [5]:
# Initialize the embeddings model (ensure it is the same model used when the
# vector store was created).
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Locate the persisted Chroma vector store. A non-empty VECTORDB selects the
# suffixed variant "../db/swim_rules_<VECTORDB>"; an empty string or None
# falls back to the base store instead of raising on len(None).
if VECTORDB:
    persist_directory = "../db/swim_rules" + "_" + VECTORDB
else:
    persist_directory = "../db/swim_rules"
vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Create a retriever from the vector store; "k": 10 is the number of results
# requested per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 10})

# Load the rule interpretation guidance that is injected into every prompt.
# Read as UTF-8 explicitly so the result does not depend on the platform default.
with open("../raw_data/interpretation_guidance.txt", encoding="utf-8") as f:
    rule_interpretation_guidance = f.read()



In [6]:
# Chat model that answers each question: the model named by RAG_LLM,
# temperature 0, and replies limited to 1000 tokens.
llm = ChatOpenAI(model=RAG_LLM, temperature=0, max_tokens=1000)


In [7]:
# Sanity check: show the total count and the first five situation records.
print(f"Swimming situations: {len(swimming_situations)}")
for preview in swimming_situations[:5]:
    print(f"Situation: {preview}")

Swimming situations: 56
Situation: {'situation_id': 9, 'prompt': 'A breaststroke swimmer moves their hands in a sculling or flipper movement at the end of the first arm stroke, both after the start and after the turn. Should they be disqualified?', 'target': 'No, the swimmer should not be disqualified. This is legal provided, "All movements of the arms shall be simultaneous without alternating movement." A sculling motion of the hands, only as a part of the arm stroke, is not considered the beginning of a new stroke.', 'rule': '101.2.2'}
Situation: {'situation_id': 10, 'prompt': 'In a 9-10 100 yard breaststroke event, a swimmer completes 50 yards and, thinking that the race is over, pushes back from the wall to read the scoreboard. At this point, realizing that the race is only halfway over, the swimmer returns to the wall, pushes off on the breast, and completes the required distance in good form. What call, if any, should be made?', 'target': 'The swimmer left the wall on the back an

In [8]:
# LLM-as-judge evaluator: uses the stock correctness prompt with GPT-4o as the
# judging model and the feedback key "correctness".
response_evaluator = create_llm_as_judge(
    model="openai:gpt-4o",
    prompt=CORRECTNESS_PROMPT,
    feedback_key="correctness",
)

In [9]:


# Choose which situations to evaluate: a reproducible random subset when
# RANDOM_SAMPLE_SIZE is set, otherwise the entire dataset.
if RANDOM_SAMPLE_SIZE:
    # Set random seed for reproducibility
    random.seed(42)

    # Cap the request at the dataset size; random.sample raises ValueError
    # when asked for more items than the population holds.
    sample_size = min(RANDOM_SAMPLE_SIZE, len(swimming_situations))
    sampled_situations = random.sample(swimming_situations, sample_size)
else:
    sampled_situations = swimming_situations

# One result record (dict) per situation; turned into a DataFrame afterwards.
test_results = []
for i, situation in enumerate(sampled_situations):
    query = situation["prompt"]
    test_case = {"question": query, "situation": situation["situation_id"]}
    print()
    print(textwrap.fill(f"\n>>>SITUATION: {situation['situation_id']} QUERY: {query}", width=80))

    # Retrieve the rule passages for this question.
    # `invoke` replaces the deprecated `get_relevant_documents`.
    docs = retriever.invoke(query)

    # Combine the retrieved documents into a single context string
    context = "\n".join(doc.page_content for doc in docs)

    # Render the full prompt text for this question
    prompt = SWIMMING_RULES_PROMPT.format(context=context, guidance=rule_interpretation_guidance, question=query)

    # Time the answer generation. perf_counter is used because it is a
    # monotonic, high-resolution clock intended for measuring intervals.
    # `llm.invoke` replaces the deprecated direct call `llm(prompt)`.
    start_time = time.perf_counter()
    response = llm.invoke(prompt)
    test_case["query_time"] = time.perf_counter() - start_time

    # Time the LLM-as-judge comparison of the answer with the reference answer
    start_time = time.perf_counter()
    correctness_score = response_evaluator(inputs=query, outputs=response.content, reference_outputs=situation["target"])
    test_case["response_eval_time"] = time.perf_counter() - start_time

    # Record the answer, the judge's verdict and the run configuration
    test_case.update({
        "llm_response": response.content,
        "correctness_score": correctness_score["score"],
        "feedback": correctness_score["comment"],
        "target": situation["target"],
        "vector_db": VECTORDB,
        "vector_db_path": persist_directory,
        "llm_model": RAG_LLM,
    })

    # Progress indicator
    print(f"completed {i+1} of {len(sampled_situations)}")

    test_results.append(test_case)



 >>>SITUATION: 9 QUERY: A breaststroke swimmer moves their hands in a sculling
or flipper movement at the end of the first arm stroke, both after the start and
after the turn. Should they be disqualified?


  docs = retriever.get_relevant_documents(query)
  response = llm(prompt)


completed 1 of 56

 >>>SITUATION: 10 QUERY: In a 9-10 100 yard breaststroke event, a swimmer
completes 50 yards and, thinking that the race is over, pushes back from the
wall to read the scoreboard. At this point, realizing that the race is only
halfway over, the swimmer returns to the wall, pushes off on the breast, and
completes the required distance in good form. What call, if any, should be made?
completed 2 of 56

 >>>SITUATION: 11 QUERY: In a 9-10 100 yard breaststroke event, a swimmer
completes 50 yards, and, thinking that the race is over, stands on the bottom of
the pool to read the scoreboard. At this point, realizing that the race is only
halfway over, the swimmer pushes off the wall on the breast and completes the
required distance in good form. What call, if any, should be made?
completed 3 of 56

 >>>SITUATION: 12 QUERY: In a senior 100 yard breaststroke event, the turn judge
calls a disqualification because the swimmer in lane 2 was not on their breast
when leaving the w

In [10]:
# Collect the per-situation records into a table (one row per situation)
# and display it as the cell output.
test_results_df = pd.DataFrame(data=test_results)
test_results_df

Unnamed: 0,question,situation,query_time,response_eval_time,llm_response,correctness_score,feedback,target,vector_db,vector_db_path,llm_model
0,A breaststroke swimmer moves their hands in a ...,9,0.979301,2.770431,"Yes, the breaststroke swimmer should be disqua...",False,The output claims that a breaststroke swimmer ...,"No, the swimmer should not be disqualified. Th...",semantic,../db/swim_rules_semantic,gpt-3.5-turbo
1,"In a 9-10 100 yard breaststroke event, a swimm...",10,0.916406,6.243926,The swimmer should be disqualified for not tou...,False,Upon evaluating the output against the rubric ...,The swimmer left the wall on the back and then...,semantic,../db/swim_rules_semantic,gpt-3.5-turbo
2,"In a 9-10 100 yard breaststroke event, a swimm...",11,1.331493,1.842237,The swimmer should be disqualified for not com...,False,"Upon evaluating the output, the response claim...","No call should be made. The rule states, “It i...",semantic,../db/swim_rules_semantic,gpt-3.5-turbo
3,"In a senior 100 yard breaststroke event, the t...",12,1.225352,5.72858,"Yes, the swimmer in lane 2 should be disqualif...",False,1. The primary point of analysis here involves...,The swimmer should not be disqualified. The ru...,semantic,../db/swim_rules_semantic,gpt-3.5-turbo
4,When a swimmer left the wall at the first turn...,13,1.193319,4.769522,"Yes, the swimmer should have been disqualified...",True,The output provided is evaluated as follows:\n...,"Yes. The rule states, “After the start and aft...",semantic,../db/swim_rules_semantic,gpt-3.5-turbo
5,"After a turn in the 100 yard breaststroke, a s...",14,1.204785,5.550017,"Yes, the swimmer should be disqualified. Accor...",False,The provided output suggests disqualification ...,No. The rule states “All movements of the arms...,semantic,../db/swim_rules_semantic,gpt-3.5-turbo
6,A swimmer swims the breaststroke in a way that...,15,1.534114,5.275572,"No, the coach is not correct. According to FIN...",True,The output is factually accurate and adheres w...,"No, the coach is not. The rule states, “The ha...",semantic,../db/swim_rules_semantic,gpt-3.5-turbo
7,To get further distance out of their breaststr...,16,1.021562,4.098391,"No, bringing the hands all the way down to the...",True,The model's output aligns well with the given ...,"It is not legal. The rule states, “The hands s...",semantic,../db/swim_rules_semantic,gpt-3.5-turbo
8,A swimmer in the 100 yard butterfly moves too ...,37,1.113297,3.431042,"Yes, this is a valid disqualification. Accordi...",True,The model output correctly identifies that the...,"This is a correct call. The rule states, “Both...",semantic,../db/swim_rules_semantic,gpt-3.5-turbo
9,"Near the finish of a butterfly race, a swimmer...",38,1.184372,7.475269,"Yes, this action is legal. According to FINA r...",False,"In evaluating the output, I first examined the...","No, this is not legal. The swimmer executed an...",semantic,../db/swim_rules_semantic,gpt-3.5-turbo


In [11]:
# Share of each correctness verdict, expressed as a percentage of all answers.
value_counts = test_results_df["correctness_score"].value_counts(normalize=True) * 100
value_counts

correctness_score
False    57.142857
True     42.857143
Name: proportion, dtype: float64

In [12]:
# Persist the results table as a pickle; the file name records which vector
# store variant and which model produced it.
results_path = f"../results/test_results_df_{VECTORDB}_{RAG_LLM}.pkl"
with open(results_path, "wb") as results_file:
    pickle.dump(test_results_df, results_file)