In [None]:
# File just for internal testing of the model and benchmarking with predefined questions. Runs the process with 140 questions.

import re
import json
from utils.model_utils import answer_with_subqueries

def _read_json(path):
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Card records; each one is looked up by its "name" key further down.
all_cards = _read_json("./data/clean-all-printings.json")

# Benchmark question sets; each question is read via its "text" and "answer"
# keys further down.
easy_questions = _read_json("./data/easy-questions.json")
hard_questions = _read_json("./data/hard-questions.json")
extra_questions = _read_json("./data/extra-questions.json")

# Single sequence the benchmark iterates over, in easy -> hard -> extra order
# (assumes each file holds a JSON array, so `+` concatenates lists).
all_questions = easy_questions + hard_questions + extra_questions


In [None]:
# Benchmark run: ask the model every question and file each outcome into one
# of the four buckets below. Every bucket entry is a dict of the form
# {"question": <question record>, "response": <model response>}.
correct_answers = []
judge_ruling_conflict_questions = []
unclear_questions = []
incorrect_questions = []

# Index all card records by lower-cased name once, so each lookup is a single
# dict access instead of a scan over the whole of all_cards for every card
# name of every question. Records sharing a name keep their all_cards order.
cards_by_name = {}
for card in all_cards:
    cards_by_name.setdefault(card["name"].lower(), []).append(card)

# Card names are written between square brackets in the question text.
card_name_pattern = re.compile(r'\[([^\]]+)\]')

for question in all_questions:

    # De-duplicate the referenced card names while keeping the order in which
    # they first appear. A set() would hand them back in an order that can
    # differ between interpreter runs, which makes the benchmark input (and
    # therefore its results) non-reproducible. Keying on the lower-cased name
    # matches the case-insensitive lookup below, so "[Foo]" and "[foo]" do not
    # add the same records twice.
    unique_names = {}
    for name in card_name_pattern.findall(question["text"]):
        unique_names.setdefault(name.lower(), name)
    card_names = list(unique_names.values())

    cards_info = []

    # Collect every record whose name matches (case-insensitively); report
    # names that are not present in the card data.
    for card_name in card_names:
        matching_cards = cards_by_name.get(card_name.lower())
        if matching_cards:
            cards_info.extend(matching_cards)
        else:
            print(f"Card {card_name} not found in database!")

    response = answer_with_subqueries(question["text"], cards_info)

    # Compare the expected and predicted one-word answers, ignoring case and
    # surrounding whitespace.
    gold = question["answer"].strip().lower()
    pred = response["single_word_answer"].strip().lower()
    if pred == "depends":
        pred = "yes"  # assume "depends" means "yes" for the sake of this benchmark

    # An exact match always counts as correct; otherwise "denied" and
    # "unclear" get their own buckets and anything else is incorrect.
    if pred == gold:
        correct_answers.append({"question": question, "response": response})
    elif pred == "denied":
        judge_ruling_conflict_questions.append({"question": question, "response": response})
    elif pred == "unclear":
        unclear_questions.append({"question": question, "response": response})
    else:
        incorrect_questions.append({"question": question, "response": response})

# Denominator used when the results are reported.
total_question_length = len(all_questions)

# Final Benchmark Test Results

## Test 1

- MAX_SUBQUERIES = 5
- TOP_K = 6
- 28 minutes
- 2nd Judge calls: 9
- correct answers: 125/140
- unclear questions: 2/140
- incorrect answers: 13/140

## Test 2

- MAX_SUBQUERIES = 5
- TOP_K = 4
- MODEL_HIGH_TEMPERATURE = 0.3
- MODEL_LOW_TEMPERATURE = 0
- MAX_CONTENT_CHUNKS = 15
- 2nd Judge calls: 7
- correct answers: 132/140
- unclear questions: 0/140
- incorrect answers: 8/140

## Test 3

- MAX_SUBQUERIES = 5
- TOP_K = 4
- MODEL_HIGH_TEMPERATURE = 0.3
- MODEL_LOW_TEMPERATURE = 0
- MAX_CONTENT_CHUNKS = 15
- 2nd Judge calls: 7
- correct answers: 133/140
- unclear questions: 1/140
- incorrect answers: 6/140

## Test 4 (system prompt tweaks)

- MAX_SUBQUERIES = 5
- TOP_K = 8
- MODEL_HIGH_TEMPERATURE = 0.3
- MODEL_LOW_TEMPERATURE = 0
- MAX_CONTENT_CHUNKS = 25
- 2nd Judge calls: 5
- correct answers: 135/140
- unclear questions: 0/140
- incorrect answers: 5/140
- time: 35m

In [None]:
# Tally each result bucket. The incorrect count is derived as whatever is left
# of the total after the other three buckets are subtracted.
num_correct = len(correct_answers)
num_judge_conflict = len(judge_ruling_conflict_questions)
num_unclear = len(unclear_questions)
num_incorrect = total_question_length - num_correct - num_judge_conflict - num_unclear

# One "<label>: <count>/<total>" line per bucket. To inspect the entries of a
# bucket, iterate over the corresponding list and print each item.
print(f"correct answers: {num_correct}/{total_question_length}")

print(f"judge ruling conflict: {num_judge_conflict}/{total_question_length}")

print(f"unclear questions: {num_unclear}/{total_question_length}")

print(f"incorrect answers: {num_incorrect}/{total_question_length}")



In [None]:
# Report layout for one incorrectly answered question: the printed label, then
# which half of the stored entry ("question" or "response") and which key in
# it supplies the value. Rows are printed top to bottom.
report_rows = (
    ("original question:", "question", "text"),
    ("correct answer: ", "question", "answer"),
    ("reworded question: ", "response", "question"),
    ("single_word_answer: ", "response", "single_word_answer"),
    ("short_answer: ", "response", "short_answer"),
    ("full_explanation: ", "response", "full_explanation"),
    ("sources: ", "response", "sources"),
)

# Dump every incorrect answer next to its expected answer, followed by a
# dashed separator line.
for entry in incorrect_questions:
    for label, part, key in report_rows:
        print(label, entry[part][key])
    print("-------------------------")

## Common Question Issues:

- blockers and bounce effects
- concept of loops (automatic infinite or triggered)
- golden rule priority
- summoning sickness persisting even through exile/bounce effects
- invoking (casting) a creature and putting it onto the battlefield
- player on 0 life and State Based Actions


### Potential improvements

- Improving embedding chunks by rule sections
- Adding card Oracle text for subqueries generation
- Creating a dictionary that suggests MTG key concepts based on terms used in query, card text or card type
  - for example:
    - attack, block, combat should point to: combat phase, summoning sickness, turn structure
    - instant spells should point to: stack, priority.
    - cards with abilities should point to: triggered abilities, stack.
- When the response deems the question unclear, the model can ask the user for more info one extra time.
