In [1]:
import os
from pathlib import Path

In [2]:
# Configuration

# Model
faiss_embedding_model_name = 'jinaai/jina-embeddings-v2-base-en'
model_name = "TheBloke/Mistral-7B-Instruct-v0.1-AWQ"

# Text splitting settings
chunk_size = 1000
chunk_overlap = 200

# Data
data_path = Path("./munchkin_rules/")

In [3]:
# nltk is used for PDF processing. Here we ensure anything it downloads goes to
# the cache folder, so it doesn't have to download again
nltk_data_path = Path("~/.cache/nltk_data").expanduser()
nltk_data_path.mkdir(parents=True, exist_ok=True)
os.environ["NLTK_DATA"] = str(nltk_data_path)

In [4]:
# Deps for PDF parsing
!pip install "unstructured[pdf]"
!sudo apt-get install -y poppler-utils tesseract-ocr

# I can't even remember why we need this one
!pip install sentence-transformers
!pip install openai

Defaulting to user installation because normal site-packages is not writeable
Collecting unstructured[pdf]
  Downloading unstructured-0.11.2-py3-none-any.whl.metadata (25 kB)
Collecting chardet (from unstructured[pdf])
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured[pdf])
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting python-magic (from unstructured[pdf])
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting lxml (from unstructured[pdf])
  Downloading lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting nltk (from unstructured[pdf])
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting tabulate (from unstructured[pdf])
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting emoji (from unstructured[pdf])
  Down

In [5]:
import re
from langchain.chains import LLMChain
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.llms import VLLMOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# main_rules = data_path / "munchkin_rules-1.pdf"    
# loader = UnstructuredPDFLoader(main_rules, mode="elements", strategy="ocr_only")
# result2 = loader.load()

In [7]:
def has_sentence_end(s):
    s = s.strip()
    enders = ['.', '?', '!', '.)', '?)', '!)', '...']
    return any(s.endswith(ender) for ender in enders)


def fix_dangling_sentences(elements):
    """Fixes sentences that are split across multiple elements"""
    sequence = []
    for el in elements:
        previous = sequence[-1] if sequence else None
        if el.metadata["category"] == "NarrativeText" or has_sentence_end(el.page_content):
            if previous and previous.metadata["category"] == "NarrativeText":
                if not has_sentence_end(previous.page_content):
                    sequence[-1].page_content += " " + el.page_content
                    continue
        sequence.append(el)
    return sequence


def organize_into_sections(elements):
    sections = [[]]
    for element in elements:
        if element.metadata["category"] == "Title":
            sections.append([element])
        else:
            sections[-1].append(element)
    sections = list(filter(None, sections))
    return sections


def build_string_from_sections(sections):
    return "\n\n".join(
        "\n".join(element.page_content for element in section)
        for section in sections
    )


def load_pdf(path):
    loader = UnstructuredPDFLoader(path, mode="elements", strategy="ocr_only")
    result = loader.load()
    result = fix_dangling_sentences(result)
    sections = organize_into_sections(result)
    text = build_string_from_sections(sections)
    metadata = loader._get_metadata()
    docs = [Document(page_content=text, metadata=metadata)]
    return docs

In [8]:
rule_docs = []
for filename in data_path.glob("*.pdf"):
    print(f"Processing {filename}")
    rule_docs.extend(load_pdf(filename))

Processing munchkin_rules/munchkin_rules-1.pdf


  from .autonotebook import tqdm as notebook_tqdm


Processing munchkin_rules/puppies-rules.pdf
Processing munchkin_rules/princesses_rules.pdf
Processing munchkin_rules/munch_4_rules_20thp.pdf


In [9]:
for doc in rule_docs:
    if str(doc.metadata["source"]).endswith("munchkin_rules-1.pdf"):
        print(doc.page_content)

MUNCHKIN brings you the essence of the dungeon-crawling experience . .. without all that messy roleplaying!
This game includes 168 cards, one six-sided die, and these rules. Three to six can play. You will need 10 tokens (coins, poker chips, whatever - or any gadget that counts to 10) for each player.

SETUP
Divide the cards into the Door deck and the Treasure deck. Shuffle both decks. Deal four cards from each deck to each player.

CARD MANAGEMENT
Keep separate face-up discard piles for the two decks. You may not look through the discards unless you play a card that allows you to! When a deck runs out, reshuffle its discards.
In Play: These are the cards on the table in front of you, showing your Race and Class (if any) and the Items you are carrying. Continuing Curses and some other cards also stay on the table after you play them. Cards in play are public information and must be visible to the other players.

Conflicts Between Cards and Rules
This rulesheet gives the general tules. 

In [10]:
def add_chunk_metadata(documents):
    last_source = None
    idx = 0
    for doc in documents:
        if doc.metadata["source"] != last_source:
            idx = 0
            last_source = doc.metadata["source"]
        doc.metadata["chunk_index"] = idx
        idx += 1

In [11]:
# Chunk text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunked_documents = text_splitter.split_documents(rule_docs)
add_chunk_metadata(chunked_documents)

In [12]:
prompt = """[INST]
Create {num_questions} questions and succinct answers using only the context provided, which is a passage from a board game rulebook. Answer as though answering a question from a player who does not have access to the rulebook. Do not prefix questions or answers with any text, numbers, punctuation, or bullet points.
End each question with a '?' character and then in a newline write the answer to that question using only the context provided. Separate each question/answer pair by a newline.

Context:
{context}
[/INST] """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt
)

llm = VLLMOpenAI(
    openai_api_key="EMPTY",
    openai_api_base="http://localhost:8000/v1",
    temperature=0.2,
    # model_kwargs=dict(repetition_penalty=1.1),
    max_tokens=2_000,
    model_name=model_name,
    frequency_penalty=0.2,
)

# Create llm chain 
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [13]:
def strip_qa_prefix(s):
    s = s.strip()

    # Remove any static prefix
    static_prefixes = ["Q:", "A:", "question:", "answer:", "Question:", "Answer:", "- ", "* "]
    for prefix in static_prefixes:
        if s.startswith(prefix):
            s = s[len(prefix):]
            s = s.strip()
    
    # Remove enumerations
    if re.match(r"^\d+\.", s):
        _, item = s.split(".", 1)
        if item != '':
            s = item.strip()

    return s.strip()


def parse_qa_pairs(llm_output, expected_num_qa_pairs):
    qa_pairs = []
    question = None
    answer = None

    segments = []
    for line in llm_output.splitlines():
        line = line.strip()
        if not line:
            continue
        if line.endswith("?"):
            segments.append([line])
        elif not segments:
            # The first line wasn't a question
            raise ValueError(f"Answer without question: {line}")
        else:
            # This isn't a question so it must be part of the answer
            segments[-1].append(line)
    
    for [question, *answer] in segments:
        answer = " ".join(answer)
        answer = answer.strip()
        if not answer:
            raise ValueError(f"Question without answer: {question}")
        qa_pairs.append((question, answer))
    
    if len(qa_pairs) != expected_num_qa_pairs:
        raise ValueError(f"Expected {expected_num_qa_pairs} QA pairs, got {len(qa_pairs)}")
    
    qa_pairs = [(strip_qa_prefix(q), strip_qa_prefix(a)) for q, a in qa_pairs]
    return qa_pairs

In [14]:
def with_retries(fn, max_retries=5):
    exc = None
    for i in range(max_retries):
        try:
            return fn()
        except Exception as e:
            exc = e
    raise exc

In [15]:
def create_questions_and_answers(llm_chain, documents, num_questions):
    qa = []
    for doc in documents:
        llm_output = llm_chain.invoke(
            {"context": doc.page_content, "num_questions": 3}
        )
        text = llm_output["text"]
        try:
            qa_pairs = with_retries(
                lambda: parse_qa_pairs(text, num_questions), 
                max_retries=3
            )
        except:
            print("Error parsing QA pairs for:")
            print(f"source: {doc.metadata['source']}")
            print(f"chunk_index: {doc.metadata['chunk_index']}")
            continue
        for q, a in qa_pairs:
            qa.append(
                dict(
                    question=q,
                    answer=a,
                    source=doc.metadata["source"],
                    chunk_index=doc.metadata["chunk_index"],
                )
            )
    return qa

In [16]:
qa_list = create_questions_and_answers(llm_chain, chunked_documents, 3)

Error parsing QA pairs for:
source: munchkin_rules/princesses_rules.pdf
chunk_index: 2


In [17]:
for qa in qa_list:
    print(qa["question"])
    print(qa["answer"])
    print()

How many players can play MUNCHKIN?
3 to 6 players can play MUNCHKIN.

What are the requirements to play MUNCHKIN?
To play MUNCHKIN, you need 10 tokens (coins, poker chips, or any gadget that counts to 10), 168 cards, one six-sided die, and these rules.

How is the game set up for MUNCHKIN?
In the setup for MUNCHKIN, divide the cards into the Door deck and the Treasure deck, shuffle both decks, deal four cards from each deck to each player, and set up separate face-up discard piles for the two decks. You may not look through the discards unless you play a card that allows you to. When a deck runs out, reshuffle its discards. Keep the cards in play visible to the other players as they are public information.

Can a card reduce a player's combat strength below Level 1?
No, according to rule iL, nothing can reduce a player below Level 1. However, a card effect might reduce a player's or a monster's combat strength below Level 1.

When can a player go up a level after combat?
A player goes

In [18]:
parse_qa_pairs(llm_chain.invoke({"context": chunked_documents[18].page_content, "num_questions": 3})["text"], 3)

[('What is the requirement to go up one level in the game?',
  'Discard Items worth a total of at least 1,000 Gold Pieces and immediately go up one level.'),
 ('Can I sell Items to go to Level 10?',
  'No, you may not sell Items to go to Level 10.'),
 ('How many levels can I go up at once if I discard 2,000 Gold Pieces worth of Items?',
  'You can go up two levels at once if you discard 2,000 Gold Pieces worth of Items.')]