In [1]:
# https://medium.com/@thakermadhav/build-your-own-rag-with-mistral-7b-and-langchain-97d0c92fa146

In [2]:
import os
import json
from pathlib import Path

In [3]:
# Configuration

# FAISS
faiss_gpu = False
faiss_embedding_model_name = 'jinaai/jina-embeddings-v2-base-en'
retrieve_topk = 6

# Text splitting settings
chunk_size = 1000
chunk_overlap = 200

# Rag settings
consistency_samples = 5

# Quantization settings
quantization_enabled = False
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = True

# Model
# model_name='mistralai/Mistral-7B-Instruct-v0.1'
model_name='teknium/OpenHermes-2.5-Mistral-7B'
# model_name='Intel/neural-chat-7b-v3-1'
# model_name='rishiraj/CatPPT'
# model_name = 'kyujinpy/Sakura-SOLAR-Instruct'

grader_model_name = 'cognitivecomputations/dolphin-2_6-phi-2'

# Data
data_path = Path("./munchkin_rules/")

device = "cuda"

In [4]:
# nltk is used for PDF processing. Here we ensure anything it downloads goes to
# the cache folder, so it doesn't have to download again
nltk_data_path = Path("~/.cache/nltk_data").expanduser()
nltk_data_path.mkdir(parents=True, exist_ok=True)
os.environ["NLTK_DATA"] = str(nltk_data_path)

In [5]:
!pip install chromadb

Defaulting to user installation because normal site-packages is not writeable



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [6]:
# Deps for PDF parsing
!pip install "unstructured[pdf]"
!sudo apt-get install -y poppler-utils tesseract-ocr

# I can't even remember why we need this one
!pip install sentence-transformers

# For phi-2
!pip install einops

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 65 not upgraded.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Defaulting to user i

In [7]:
from typing import Optional, Tuple

import torch
import transformers

from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.chains import LLMChain
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate, StringPromptTemplate
from langchain.retrievers import ParentDocumentRetriever
from langchain.schema import AIMessage
from langchain.schema.runnable import RunnablePassthrough
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import VectorStore
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.language_models import BaseChatModel
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables import Runnable
from langchain_core.runnables import RunnableBranch
from transformers import (
    PreTrainedModel, 
    PreTrainedTokenizerBase,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoModel,
)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
%load_ext autoreload
%autoreload 2
from util import HuggingFaceChatModel, VectorStoreRetrieverWithTextSplitter, parse_pdf

In [9]:
print(f"{transformers.__version__=}")
print(f"{torch.__version__=}")
print(f"{torch.version.cuda=}")
print(f"{torch.cuda.is_available()=}")
print(f"{torch.cuda.device_count()=}")

if "cuda" in device:
    assert torch.cuda.is_available(), "CUDA is not available"

transformers.__version__='4.36.0'
torch.__version__='2.1.1+cu118'
torch.version.cuda='11.8'
torch.cuda.is_available()=True
torch.cuda.device_count()=1


In [10]:
!nvidia-smi

Sat Jan  6 19:51:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:07:00.0  On |                  N/A |
|  0%   60C    P5              46W / 420W |    742MiB / 24576MiB |     30%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Build Chat Model

In [11]:
def load_transformers_model(model_name:str, bnb_config:Optional[BitsAndBytesConfig]=None) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map="auto"
    )
    if tokenizer.pad_token is None:
        print("Setting pad token")
        # For some reason, this isn't set in the config. For Mistral, it's just
        # the EOS token (which is the default). However, with OpenHermes, the
        # EOS token is a different token, but the padding token appears to still
        # be </s>:
        #
        # https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/special_tokens_map.json
        #
        # So if it is not set, we just set it explicitly to </s> here.
        tokenizer.pad_token = '</s>'

    if bnb_config is not None:
        model_kwargs = {"quantization_config": bnb_config}
    else:
        model_kwargs = {"torch_dtype": torch.float16}

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        # OpenHermes has the KV-cache disabled by default (perhaps they setup the
        # defaults for fine-tuning?). Enabling it here because it is way way
        # faster.
        use_cache=True,
        trust_remote_code=True,
        **model_kwargs,
    )

    return (model, tokenizer)


compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

model, tokenizer = load_transformers_model(model_name, bnb_config)

print("BOS token:", tokenizer.bos_token)
print("EOS token:", tokenizer.eos_token)
print("PAD token:", tokenizer.pad_token)
print("UNK token:", tokenizer.unk_token)

# Example with standard completion
input_ids = tokenizer.encode("What is the capital of the U.S.?", return_tensors="pt").to(model.device)

results = model.generate(input_ids, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, max_new_tokens=100)
sequence = results[0]
sequence = tokenizer.decode(sequence)
print("Completion:")
print(sequence)
print()
print()

# Example using chat
messages = [
    # {
    #     "role": "system",
    #     "content": "You are a helpfule Q&A AI assistant."
    # },
    {
        "role": "user",
        "content": "What is the capital of the U.S.?"
    }
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

results = model.generate(input_ids, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, max_new_tokens=100)
sequence = results[0]
sequence = tokenizer.decode(sequence)
print("Chat:")
print(sequence)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Setting pad token


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.88s/it]


BOS token: <s>
EOS token: <|im_end|>
PAD token: </s>
UNK token: <unk>
Completion:
<s> What is the capital of the U.S.?

Washington, D.C. is the capital of the United States. It is located on the East Coast, between the states of Maryland and Virginia.

What is the capital of France?

The capital of France is Paris. It is located in the northern part of the country and is known for its rich history, culture, and landmarks such as the Eiffel Tower and the Louvre Museum.

What is the capital of Canada?

The capital


Chat:
<|im_start|> user
What is the capital of the U.S.?<|im_end|> 
<|im_start|> assistant
The capital of the United States is Washington, D.C.<|im_end|>


In [12]:
messages = [
    # {
    #     "role": "system",
    #     "content": "You are a helpfule Q&A AI assistant."
    # },
    {
        "role": "user",
        "content": "Summarize the history of the U.S. in 200 words or less."
    }
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

results = model.generate(input_ids, temperature=0.7, do_sample=True, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, max_new_tokens=500, num_return_sequences=2)
sequence = results[0]
sequence = tokenizer.decode(sequence)
print(tokenizer.decode(results[0]))
print()
print()
print(tokenizer.decode(results[1]))

<|im_start|> user
Summarize the history of the U.S. in 200 words or less.<|im_end|> 
<|im_start|> assistant
The United States has a rich and complex history that spans over 250 years. The country was founded in 1776, when 13 British colonies declared their independence and formed the United States of America. The early years of the country were marked by expansion, with the acquisition of new territories through war and treaty. The Civil War, fought between 1861 and 1865, was a major event in American history, as it resulted in the abolition of slavery and the reunification of the country.

The early 20th century saw the United States emerge as a global superpower, with its economy and military might growing rapidly. The country played a leading role in both World War I and World War II, and established itself as a champion of democracy and human rights. In the post-war years, the United States became a major player in the Cold War, facing off against the Soviet Union in a struggle for

In [13]:
model.config

MistralConfig {
  "_name_or_path": "teknium/OpenHermes-2.5-Mistral-7B",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "

In [14]:
# Vanilla mistral:

# MistralConfig {
#   "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.1",
#   "architectures": [
#     "MistralForCausalLM"
#   ],
#   "attention_dropout": 0.0,
#   "bos_token_id": 1,
#   "eos_token_id": 2,
#   "hidden_act": "silu",
#   "hidden_size": 4096,
#   "initializer_range": 0.02,
#   "intermediate_size": 14336,
#   "max_position_embeddings": 32768,
#   "model_type": "mistral",
#   "num_attention_heads": 32,
#   "num_hidden_layers": 32,
#   "num_key_value_heads": 8,
#   "quantization_config": {
#     "bnb_4bit_compute_dtype": "float16",
#     "bnb_4bit_quant_type": "nf4",
#     "bnb_4bit_use_double_quant": true,
#     "llm_int8_enable_fp32_cpu_offload": false,
#     "llm_int8_has_fp16_weight": false,
#     "llm_int8_skip_modules": null,
#     "llm_int8_threshold": 6.0,
#     "load_in_4bit": true,
#     "load_in_8bit": false,
#     "quant_method": "bitsandbytes"
#   },
#   "rms_norm_eps": 1e-05,
#   "rope_theta": 10000.0,
#   "sliding_window": 4096,
#   "tie_word_embeddings": false,
#   "torch_dtype": "bfloat16",
#   "transformers_version": "4.36.2",
#   "use_cache": true,
#   "vocab_size": 32000
# }

# GenerationConfig {
#   "bos_token_id": 1,
#   "eos_token_id": 2
# }

# Hermes:

# MistralConfig {
#   "_name_or_path": "teknium/OpenHermes-2.5-Mistral-7B",
#   "architectures": [
#     "MistralForCausalLM"
#   ],
#   "attention_dropout": 0.0,
#   "bos_token_id": 1,
#   "eos_token_id": 32000,
#   "hidden_act": "silu",
#   "hidden_size": 4096,
#   "initializer_range": 0.02,
#   "intermediate_size": 14336,
#   "max_position_embeddings": 32768,
#   "model_type": "mistral",
#   "num_attention_heads": 32,
#   "num_hidden_layers": 32,
#   "num_key_value_heads": 8,
#   "quantization_config": {
#     "bnb_4bit_compute_dtype": "float16",
#     "bnb_4bit_quant_type": "nf4",
#     "bnb_4bit_use_double_quant": true,
#     "llm_int8_enable_fp32_cpu_offload": false,
#     "llm_int8_has_fp16_weight": false,
#     "llm_int8_skip_modules": null,
#     "llm_int8_threshold": 6.0,
#     "load_in_4bit": true,
#     "load_in_8bit": false,
#     "quant_method": "bitsandbytes"
#   },
#   "rms_norm_eps": 1e-05,
#   "rope_theta": 10000.0,
#   "sliding_window": 4096,
#   "tie_word_embeddings": false,
#   "torch_dtype": "bfloat16",
#   "transformers_version": "4.36.2",
#   "use_cache": false,
#   "vocab_size": 32002
# }

# GenerationConfig {
#   "bos_token_id": 1,
#   "eos_token_id": 32000,
#   "use_cache": false
# }

model.generation_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000
}

In [15]:
from langchain.schema import SystemMessage, HumanMessage

chat_model = HuggingFaceChatModel(
    model=model,
    tokenizer=tokenizer,
    generate_kwargs={},
    max_tokens=500
)

chat_model.invoke(
    [
        SystemMessage(content="You are a helpful Q&A AI assistant."),
        HumanMessage(content="What is the capital of the U.S.?")
    ],
)

AIMessage(content='The capital of the United States is Washington, D.C. (District of Columbia).')

## Build Grader Chat Model

To create a level playing field when comparing model performance, we always use
the same language model for grading. We load this model now before we start
hogging all the vram.

In [18]:
grader_model, grader_tokenizer = load_transformers_model(grader_model_name, bnb_config)

grader_chat_model = HuggingFaceChatModel(
    model=grader_model,
    tokenizer=grader_tokenizer,
    generate_kwargs={},
    max_tokens=500
)

grader_chat_model.invoke(
    [
        SystemMessage(content="You are a helpful Q&A AI assistant."),
        HumanMessage(content="What is the capital of the U.S.?")
    ],
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.52s/it]
Some weights of the model checkpoint at cognitivecomputations/dolphin-2_6-phi-2 were not used when initializing PhiForCausalLM: ['lm_head.linear.lora_A.default.weight', 'lm_head.linear.lora_B.default.weight']
- This IS expected if you are initializing PhiForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PhiForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AIMessage(content="The capital of the United States is Washington, D.C. It is a federal district and the seat of the federal government of the United States. It is located on the east bank of the Potomac River across from the District of Columbia's western border with Maryland. The city is divided into two parts: the Federal City, which is the area around the White House and the U.S. Capitol, and the Georgetown neighborhood, which is the area around the U.S. Naval Observatory and the U.S. Army's Fort Lesley J. McNair. Washington, D.C. is the fourth-smallest capital city in the world by area.")

## Build Retriever

In [19]:
def load_docs():
    rule_docs = []
    for filename in data_path.glob("*.pdf"):
        print(f"Processing {filename}")
        rule_docs.extend(parse_pdf(filename))
    return rule_docs

In [20]:
rule_docs = load_docs()

Processing munchkin_rules/munchkin_rules-1.pdf
Processing munchkin_rules/puppies-rules.pdf
Processing munchkin_rules/princesses_rules.pdf
Processing munchkin_rules/munch_4_rules_20thp.pdf


In [21]:
total_rule_tokens = sum(len(tokenizer.tokenize(doc.page_content)) for doc in rule_docs)
print(f"Total doc tokens: {total_rule_tokens:,}")

Total doc tokens: 11,641


In [22]:
def load_embedding_model(model_name:str) -> HuggingFaceEmbeddings:
    # We first load the embedding model using AutoModel so that we can pass
    # trust_remote_code=True to install it, which we cannot do with 
    # HuggingFaceEmbeddings (https://github.com/langchain-ai/langchain/issues/6080)
    _ = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'})
    return embedding_model

In [23]:
def build_vectorstore(embedding_model:HuggingFaceEmbeddings) -> VectorStore:
    db = Chroma(embedding_function=embedding_model)
    return db

In [24]:
# Retriever config dataclass
from dataclasses import dataclass
import math

# @dataclass
# class RetrieverConfig:
#     max_context_size:int = 4096
#     percent_context_use:float = 0.5
#     parent_percent:float = 0.25
#     parent_overlap_percent:float = 0.1
#     child_percent:float = 0.25
#     child_overlap_percent:float = 0.1
#     retrieve_extra_results_percent:float = 0.0

@dataclass
class RetrieverConfig:
    parent_chunk_size:int = 500
    parent_chunk_overlap:int = 50
    child_chunk_size:int = 125
    child_chunk_overlap:int = 12
    k:int = 10



def build_retriever(
    tokenizer:PreTrainedTokenizerBase,
    vectorstore:VectorStore,
    config:RetrieverConfig,
):
    parent_chunk_size = config.parent_chunk_size
    parent_chunk_overlap = config.parent_chunk_overlap
    parent_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=parent_chunk_size,
        chunk_overlap=parent_chunk_overlap
    )

    k = config.k

    search_kwargs={"k": k}

    child_chunk_size = config.child_chunk_size

    if child_chunk_size > 0:
        child_chunk_overlap = config.child_chunk_overlap
        child_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            tokenizer,
            chunk_size=child_chunk_size,
            chunk_overlap=child_chunk_overlap
        )
        store = InMemoryStore()
        retriever = ParentDocumentRetriever(
            vectorstore=vectorstore,
            docstore=store,
            child_splitter=child_splitter,
            parent_splitter=parent_splitter,
            search_kwargs=search_kwargs,
        )
    else:
        retriever = VectorStoreRetrieverWithTextSplitter(
            vectorstore=vectorstore,
            text_splitter=parent_splitter,
            search_kwargs=search_kwargs,
        )

    return retriever

db = build_vectorstore(load_embedding_model(faiss_embedding_model_name))
retriever = build_retriever(tokenizer, db, RetrieverConfig())
retriever.add_documents(rule_docs)

In [25]:
query = "Can I play a Go Up a Level card during combat?"
result = db.similarity_search(query)
print(f"Query: {query}")
print()
print(f"From vectorstore:")
print(f"Result count: {len(result)}")
print(f"Doc length: {len(result[0].page_content)}")
print(f"Content: {result[0].page_content[:100]}")
print()

result = retriever.invoke("Can I play a Go Up a Level card during combat?")
print(f"From retriever:")
print(f"Result count: {len(result)}")
print(f"Doc length: {len(result[0].page_content)}")
print(f"Content: {result[0].page_content[:100]}")

# print(f"{str(len(result)):<5.5} {len(result[0].page_content):<5.5} {result[0].page_content[:100]}")

Query: Can I play a Go Up a Level card during combat?

From vectorstore:
Result count: 4
Doc length: 423
Content: OTHER TREASURES
Other Treasure cards (like Go Up a Level cards) are not Items. Most of these cards s

From retriever:
Result count: 9
Doc length: 1017
Content: "ONE-SHOT” TREASURES
A Treasure card that says “Usable once only” is often called a “one-shot” Treas


## Sampling LLM Chain

For self consistency, we need a way to sample multiple results from the LLM.

In [26]:
@dataclass
class SamplingConfig:
    temperature:float = 0.7
    top_k:Optional[int] = 0
    top_p:Optional[float] = None
    samples:int = 1

def build_sampling_llm_chain(chat_model:BaseChatModel, config:SamplingConfig) -> Runnable:
    temperature = config.temperature
    top_k = config.top_k
    top_p = config.top_p
    n = config.samples
    kwargs = dict(temperature=temperature, top_k=top_k, top_p=top_p, n=n)
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    
    return RunnableLambda(
        # lambda x: chat_model.batch([x]*n, temperature=temperature, top_k=top_k, top_p=top_p)
        lambda x: [
            AIMessage(content=g.text)
            for g in chat_model.generate([chat_model._convert_input(x).to_messages()], **kwargs).generations[0]
        ]
    ).with_config({"run_name": "chat-sampling"})

sampling_chain = build_sampling_llm_chain(chat_model, SamplingConfig(samples=3))
sampling_chain.invoke("What is the capital of the U.S.?")

[AIMessage(content='The capital of the United States is Washington, D.C.'),
 AIMessage(content='The capital of the United States is Washington, D.C.'),
 AIMessage(content='The capital of the United States is Washington, D.C.')]

In [27]:
from langchain.chains.conversational_retrieval.base import BaseConversationalRetrievalChain

## Basic RAG

In [28]:
from operator import itemgetter
from langchain_core.prompts import ChatPromptTemplate, BasePromptTemplate, format_document
from langchain.schema import Document
from functools import partial

document_prompt_template = """---
NAME: {source}
PAGE: {page_number}
PASSAGE:
{page_content}
---"""

DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(document_prompt_template)

rag_prompt_template = """\
Answer the question based only on the following context of the board game \
rules. Do not use any other information.

----
{context}
----

Question: {question}
Let's think step by step.
"""

DEFAULT_RAG_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert at board games. Board game players ask you questions about board games, and you provide succinct answers in your own words."),
        ("human", rag_prompt_template),
    ]
)

follow_up_prompt_template = """\
Now revise your answer. Be concise and remove any unrelated information to the question.
"""

follow_up_prompt = ChatPromptTemplate.from_messages(
    [
        ("assistant", "{answer}"),
        ("human", follow_up_prompt_template)
    ]
)

def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

def build_basic_rag_chain(retriever_chain, chat_model, sampling_chain=None, prompt=DEFAULT_RAG_PROMPT, document_prompt=DEFAULT_DOCUMENT_PROMPT):
    if sampling_chain is None:
        sampling_chain = chat_model

    followup_chain = prompt+follow_up_prompt | chat_model

    def is_multiple_answers(x):
        return isinstance(x["answer"], list)
    
    def duplicate_dict_for_each_answer(x):
        return [{**x, "answer": answer} for answer in x["answer"]]

    followup_branch = RunnableBranch(
        (
            is_multiple_answers, 
            RunnableLambda(duplicate_dict_for_each_answer) | followup_chain.map()
        ),
        followup_chain,
    )

    # context_chain = itemgetter("question") | retriever | _combine_documents
    return (
        RunnablePassthrough.assign(
            documents=itemgetter("question") | retriever_chain
        )
        | RunnablePassthrough.assign(
            context=RunnableLambda(itemgetter("documents")) | partial(_combine_documents, document_prompt=document_prompt)
        )
        | RunnablePassthrough.assign(
            answer=RunnablePassthrough() | prompt | sampling_chain
        )
        | RunnablePassthrough.assign(
            answer=followup_branch
        )
    ).with_config({"run_name": "basic-rag-chain"})

In [29]:
basic_rag_chain = build_basic_rag_chain(retriever, chat_model.with_config({"callbacks": [ConsoleCallbackHandler()]}))
result = basic_rag_chain.invoke({"question": "Can I play a Go Up a Level card during combat?"})
print("Question:")
print(result["question"])
print()
print()
print("Context:")
print(result["context"])
print()
print()
print("Answer:")
print(result["answer"])

[32;1m[1;3m[llm/start][0m [1m[1:llm:HuggingFaceChatModel] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are an expert at board games. Board game players ask you questions about board games, and you provide succinct answers in your own words.\nHuman: Answer the question based only on the following context of the board game rules. Do not use any other information.\n\n----\n---\nNAME: munchkin_rules/munchkin_rules-1.pdf\nPAGE: 3\nPASSAGE:\n\"ONE-SHOT” TREASURES\nA Treasure card that says “Usable once only” is often called a “one-shot” Treasure. Most of these are used during combat to strengthen the munchkins or the monsters, and may be played from your hand or from the table. Some have other effects, however, so read the card carefully! Discard these cards as soon as the combat is over or their effect is resolved.\nOne-shot Items with a Gold Piece value may be sold for levels, just like other Items.\n\nOTHER TREASURES\nOther Treasure cards (like Go Up a Level card

## Universal Self-Consistency
https://arxiv.org/abs/2311.17311

In [30]:
from typing import List
from langchain.output_parsers import RegexParser

response_prompt_template = """{page_content}"""

consensus_prompt_template = """\
I have generated the following responses to the question: {question}

{context}

Evaluate these responses.
Select the most consistent response based on majority consensus.
Start your answer with "The most consistent response is Response X" (without \
quotes)
"""

DEFAULT_CONSENSUS_DOCUMENT_PROMPT = PromptTemplate.from_template(
    response_prompt_template
)

DEFAULT_CONSENSUS_PROMPT = ChatPromptTemplate.from_template(
    consensus_prompt_template
)

response_selection_parser = RegexParser(
    regex=r"(?i)response\s+(\d+)",
    output_keys=["response_selected_index"],
)

def convert_to_document(message: AIMessage) -> Document:
    return Document(
        page_content=message.content,
    )

def format_responses(responses:List[Document], document_prompt=DEFAULT_CONSENSUS_DOCUMENT_PROMPT, document_separator="\n\n") -> str:
    formatted = [f"Response {i}\n{format_document(doc, document_prompt)}" for i, doc in enumerate(responses)]
    return document_separator.join(formatted)

def build_universal_consistency_chain(chat_model:BaseChatModel, prompt=DEFAULT_CONSENSUS_PROMPT) -> Runnable:
    chat_model_consistency = chat_model.bind(temperature=0, max_tokens=1000)

    # chain that takes a list of responses and returns a formatted string combining them
    format_responses_chain = RunnableLambda(convert_to_document).map() | format_responses

    # chain that takes a question and a context and returns the index of the consensus response
    select_response_index_chain = prompt | chat_model_consistency | response_selection_parser | itemgetter(response_selection_parser.output_keys[0]) | int

    # chain that takes responses and a response_selected_index and returns the response at that index
    select_response_from_index_chain = RunnableLambda(lambda x: x["responses"][x["response_selected_index"]])

    # chain that takes a question and candiatate responses and returns the consensus response
    consistency_chain = (
        {"question": itemgetter("question"), "responses": itemgetter("responses"), "context": itemgetter("responses") | format_responses_chain}
        | RunnablePassthrough.assign(response_selected_index=select_response_index_chain)
        | select_response_from_index_chain
    )

    # chain that picks the first response if the consistency chain fails to
    # parse a response index
    fallback_chain = RunnableLambda(lambda x: x["responses"][0])

    return consistency_chain.with_fallbacks([fallback_chain], exceptions_to_handle=(ValueError, IndexError)).with_config({"run_name": "universal-consistency"})

In [31]:
basic_rag_chain = build_basic_rag_chain(retriever, chat_model, sampling_chain)
consistency_chain = build_universal_consistency_chain(chat_model)

result = basic_rag_chain.invoke({"question": "Can I play a Go Up a Level card during combat?"})
consistency_result = consistency_chain.invoke({
    "question": result["question"],
    "responses": result["answer"],
})

print("Question:")
print(result["question"])
print()
print()
print("Answers:")
print('\n\n'.join(answer.content for answer in result["answer"]))
print()
print()
print("Consensus answer:")
print(consistency_result.content)

Question:
Can I play a Go Up a Level card during combat?


Answers:
Yes, you can play a Go Up a Level card during combat.

Yes, you can play a Go Up a Level card during combat.

Yes, you can play a Go Up a Level card during combat.


Consensus answer:
Yes, you can play a Go Up a Level card during combat.


## RAG with Thread-of-Thought

https://arxiv.org/abs/2311.08734

In [32]:
from langchain_core.runnables import RunnableBranch

# https://arxiv.org/abs/2311.08734


# You are an AI assistant to help boardgame players find answers to their rules \
# questions.
# Answer the question based only on the following board game rule excerpts. Do \
# not use any other information. Never use the word "excerpts" in your answer. \
# Simply refer to the context as the rules.

# As a content reviewer, I provide multiple retrieved passages about this \
# question; you need to answer the question.

# If you don't know the answer, just say that you don't know, don't try to make \
# up an answer.

thread_of_thought_template = """\
You are an AI assistant to help board game players find answers to their rule \
questions.

Answer the question based only on the following board game rule \
excerpts. Do not use any other information. Never use the word "excerpts" in \
your answer. Simply refer to the context as the rules.

----
{context}
----

Q: {question}
Walk me through this context in manageable parts step by step, summarizing and \
analyzing as we go.
"""

DEFAULT_THREAD_OF_THOUGHT_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an AI assistant to help board game players find answers to their rule questions."),
        ("human", thread_of_thought_template),
    ]
)

DEFAULT_THEREFORE_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("assistant", "{answer}"),
        ("human", "Now revise your answer. Be concise and remove any unrelated information to the question.")
    ]
)


def thread_of_thought_combine_documents(
    docs, 
    document_prompt=DEFAULT_DOCUMENT_PROMPT, 
    document_separator="\n"
):
    formatted = [
        f"{format_document(doc, document_prompt)}" 
        for i, doc in enumerate(docs)
    ]
    return document_separator.join(formatted)


def build_thread_of_thought_rag_chain(
    retriever_chain:Runnable, 
    chat_model:BaseChatModel, 
    sampling_chain=None, 
    prompt=DEFAULT_THREAD_OF_THOUGHT_PROMPT, 
    therefore_prompt=DEFAULT_THEREFORE_PROMPT
) -> Runnable:
    chat_model = chat_model.bind(temperature=0, max_tokens=1000)

    if sampling_chain is None:
        sampling_chain = chat_model
    
    # context_chain = retriever | thread_of_thought_combine_documents

    followup_chain = prompt+therefore_prompt | chat_model

    def is_multiple_answers(x):
        return isinstance(x["answer"], list)
    
    def duplicate_dict_for_each_answer(x):
        return [{**x, "answer": answer} for answer in x["answer"]]

    followup_branch = RunnableBranch(
        (
            is_multiple_answers,
            RunnableLambda(duplicate_dict_for_each_answer) | followup_chain.map()
        ),
        followup_chain,
    )

    return (
        RunnablePassthrough.assign(
            documents=itemgetter("question") | retriever_chain
        )
        | RunnablePassthrough.assign(
            context=(
                RunnableLambda(itemgetter("documents"))
                | thread_of_thought_combine_documents
            )
        )
        | RunnablePassthrough.assign(
            answer=(
                RunnablePassthrough()
                | prompt
                | sampling_chain
            )
        )
        | RunnablePassthrough.assign(
            answer=followup_branch
        )

    ).with_config({"run_name": "thread-of-thought"})

In [33]:
thread_of_thought_rag = build_thread_of_thought_rag_chain(
    retriever,
    chat_model.with_config({'callbacks': [ConsoleCallbackHandler()]})
)
result = thread_of_thought_rag.invoke(
    {"question": "Can I play a Go Up a Level card during combat?"}
)
print("Question:")
print(result["question"])
print()
print()
print("Context:")
print(result["context"])
print()
print()
print("Answer:")
print(result["answer"])

[32;1m[1;3m[llm/start][0m [1m[1:llm:HuggingFaceChatModel] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are an AI assistant to help board game players find answers to their rule questions.\nHuman: You are an AI assistant to help board game players find answers to their rule questions.\n\nAnswer the question based only on the following board game rule excerpts. Do not use any other information. Never use the word \"excerpts\" in your answer. Simply refer to the context as the rules.\n\n----\n---\nNAME: munchkin_rules/munchkin_rules-1.pdf\nPAGE: 3\nPASSAGE:\n\"ONE-SHOT” TREASURES\nA Treasure card that says “Usable once only” is often called a “one-shot” Treasure. Most of these are used during combat to strengthen the munchkins or the monsters, and may be played from your hand or from the table. Some have other effects, however, so read the card carefully! Discard these cards as soon as the combat is over or their effect is resolved.\nOne-shot Items with a Gold Pi

## Glue Code

In [34]:
from typing import Union

RetrieverWithAddDocuments = Union[ParentDocumentRetriever, VectorStoreRetrieverWithTextSplitter]

In [35]:
@dataclass
class RagChainConfig:
    rag_prompt:PromptTemplate = DEFAULT_RAG_PROMPT
    thread_of_thought_enabled:bool = True
    thread_of_thought_prompt:PromptTemplate = DEFAULT_THREAD_OF_THOUGHT_PROMPT
    thread_of_thought_therefore_prompt:PromptTemplate = DEFAULT_THEREFORE_PROMPT
    consensus_prompt:PromptTemplate = DEFAULT_CONSENSUS_PROMPT
    number_of_documents:int = 5


def build_rag_chain(
    chat_model:BaseChatModel,
    sampling_chain:Runnable,
    retriever:RetrieverWithAddDocuments,
    config:RagChainConfig,
) -> Runnable:
    retriever_chain = (
        retriever
        | RunnableLambda(lambda documents: documents[:config.number_of_documents])
    )

    if config.thread_of_thought_enabled:
        rag_chain = build_thread_of_thought_rag_chain(
            retriever_chain=retriever_chain,
            chat_model=chat_model,
            sampling_chain=sampling_chain,
            prompt=config.thread_of_thought_prompt,
            therefore_prompt=config.thread_of_thought_therefore_prompt,
        )
    else:
        rag_chain = build_basic_rag_chain(
            retriever_chain=retriever_chain,
            chat_chain=sampling_chain,
            prompt=config.rag_prompt,
        )
    consistency_chain = build_universal_consistency_chain(chat_model, prompt=config.consensus_prompt)

    consistency_chain = (
        {"question": itemgetter("question"), "responses": itemgetter("answer")}
        | consistency_chain
    )

    return (
        rag_chain
        | RunnablePassthrough.assign(
            answer=consistency_chain
        )
    )

In [36]:
def build_complete_chain(
        chat_model, 
        tokenizer, 
        vectorstore,
        retriever_config:RetrieverConfig, 
        sampling_config:SamplingConfig, 
        rag_config:RagChainConfig
) -> Tuple[RetrieverWithAddDocuments, Runnable]:
    retriever = build_retriever(tokenizer, vectorstore, retriever_config)
    sampling_chain = build_sampling_llm_chain(chat_model, sampling_config)
    rag_chain = build_rag_chain(chat_model, sampling_chain, retriever, rag_config)
    full_chain = {"question": RunnablePassthrough()} | rag_chain

    return retriever, full_chain

In [37]:
chat_model.callbacks = [ConsoleCallbackHandler()]

# Reset the vectorstore so we can add the documents again with the appropriate
# chunk sizes. Unfortunately, we can't just instantiate a new instance of Chroma
# as that seems to just create a new instance of the same vectorstore. Also, we
# cannot just delete the collection and then resuse the same instance, as then
# we will get an error about the collection not existing >:( so we have to both
# delete the collection and create a new instance. How annoying.
try:
    db.delete_collection()
except ValueError:
    # we might have already delete it. Good for us!
    pass

db = build_vectorstore(load_embedding_model(faiss_embedding_model_name))



retriever, complete_chain = build_complete_chain(
    chat_model=chat_model,
    tokenizer=tokenizer,
    vectorstore=db,
    retriever_config=RetrieverConfig(),
    sampling_config=SamplingConfig(
        temperature=0.7,
        samples=5,
    ),
    rag_config=RagChainConfig(
        thread_of_thought_enabled=True
    ),
)

retriever.add_documents(rule_docs)

query = "Can I play a Go Up a Level card during combat?"
result = complete_chain.invoke(query)

print("Question:")
print(result["question"])
print()
print()
print("Answer:")
print(result["answer"].content)

[32;1m[1;3m[llm/start][0m [1m[1:llm:HuggingFaceChatModel] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are an AI assistant to help board game players find answers to their rule questions.\nHuman: You are an AI assistant to help board game players find answers to their rule questions.\n\nAnswer the question based only on the following board game rule excerpts. Do not use any other information. Never use the word \"excerpts\" in your answer. Simply refer to the context as the rules.\n\n----\n---\nNAME: munchkin_rules/munchkin_rules-1.pdf\nPAGE: 3\nPASSAGE:\n\"ONE-SHOT” TREASURES\nA Treasure card that says “Usable once only” is often called a “one-shot” Treasure. Most of these are used during combat to strengthen the munchkins or the monsters, and may be played from your hand or from the table. Some have other effects, however, so read the card carefully! Discard these cards as soon as the combat is over or their effect is resolved.\nOne-shot Items with a Gold Pi

In [38]:
db.similarity_search("Can I play a Super Munchkin card without a class card?")

[Document(page_content='SUPER MUNCHKIN AND HALF-BREED\nThese cards may be played whenever it is legal to play a Class or Race, as long as you have an appropriate card (Class for Super Munchkin, Race for Half-Breed) to attach it to. You cannot have more than one of the same Class or Race card in play at once.', metadata={'doc_id': '44dc4108-b05c-4081-ad28-630d753bda67', 'page_number': 2, 'source': 'munchkin_rules/munchkin_rules-1.pdf'}),
 Document(page_content='If you play Super Munchkin while you have two Classes, you have all the normal advantages and disadvantages of both Classes. (All of the above is also true for', metadata={'doc_id': '44dc4108-b05c-4081-ad28-630d753bda67', 'page_number': 2, 'source': 'munchkin_rules/munchkin_rules-1.pdf'}),
 Document(page_content="You can discard a Class card at any time, even in combat: “I don't wanna be a wizard anymore.” When you discard a Class card, you become classless until you play another Class card.\nYou may not belong to more than one c

## Evaluating

In [39]:
test_cases = [
    {
        "query": "If a monster does not pursue me because my level is too low, can I still loot the room?",
        "answer": "No, you cannot loot the room."
    },
    {
        "query": "Can I sell items from my hand to go up a level, assuming I can sell 1,000 gold pieces worth?",
        "answer": "Yes. You can sell items from your hand to go up a level."
    },
    {
        "query": "If a hireling is removed from play due to Bad Stuff, does the player retain any items the hireling was carrying?",
        "answer": "No. When a hireling is removed from play due to bad stuff, any items the hireling was carrying are also removed from play."
    },
    {
        "query": "Can I have multiple steeds equipped at the same time?",
        "answer": "No. You can only have one steed equipped at a time."
    },
    {
        "query": "Can I play a Go Up a Level card during combat on my turn?",
        "answer": "Yes. You can play a Go Up a Level card at any time."
    },
    {
        "query": "How many players can join me in a combat?",
        "answer": "Only one player can join you in combat."
    },
    {
        "query": "Does a player retain their princess card in play if they die?",
        "answer": "Yes. A player retains their princess card in play if they die."
    },
    {
        "query": "Can I carry multiple big items so long as only one is equipped?",
        "answer": "No. You can only carry one big item at a time."
    },
    {
        "query": "What is an item in play but not equipped called?",
        "answer": "An item in play but not equipped is called a carried item."
    },
    {
        "query": "If after breaking down the door I draw a steed face up, what are my options?",
        "answer": "You can put the steed into your hand, equip it, or treat as a monster and fight it."
    },
    {
        "query": "When can I play a Super Munchkin card?",
        "answer": "You can play a Super Munchkin card whenever it is legal to play a Class card."
    },
    {
        "query": "Can I play a Super Munchkin card without a class card?",
        "answer": "No, you must have a class card to attach it to."
    },
    {
        "query": "What cards can I trade with other players?",
        "answer": "You can trade any item cards in play (on the table) with other players."
    },
    {
        "query": "When can I player a Hireling?",
        "answer": "At any time."
    },
    {
        "query": "Can I play a Hireling card if I already have a Hireling in play?",
        "answer": "No. You can only have one Hireling in play at a time."
    },
    {
        "query": "When I loot the room, is the door card drawn face up or face down?",
        "answer": "The door card is drawn face down."
    },
    {
        "query": "Can I use a card to compel another player to help me in combat if winning that combat would give me the winning level?",
        "answer": "No. You cannot compel another player to get the winning level."
    },
    {
        "query": "Can I play a Go Up a Level card on another player?",
        "answer": "Yes."
    },
    {
        "query": "Can I play a Curse while in combat?",
        "answer": "Yes. A curse may be played at any time."
    },
    {
        "query": "How can I get rid of my Class card?",
        "answer": "You can discard your Class card at any time."
    },
    {
        "query": "When can I discard a Race card?",
        "answer": "You can discard your Race card at any time."
    }
]

In [44]:
from langchain.evaluation.qa.eval_chain import QAEvalChain, CotQAEvalChain
from langchain_core.output_parsers.string import StrOutputParser
import copy

eval_template = """You are a teacher grading a quiz.
You are given a question, the student's answer, and the true answer, and are \
asked to score the student answer as either CORRECT or INCORRECT.
Write out in a step by step manner your reasoning to be sure that your \
conclusion is correct. Avoid simply stating the correct answer at the outset.
The student answer may contain additional information, it may also provide \
clarifications and exceptions to the true answer, so long as it does not \
contradict the true asnwer. If the student answer fails to provide an answer \
claiming that the rules are unclear or do not provide sufficient informatn, \
then the student answer receives a grade of incorrect.
There is no partial credit. Answers are either CORRECT or INCORRECT.
At the end, always output "GRADE: CORRECT" or "GRADE: INCORRECT" (without the \
quotes) to indicate your final conclusion on a line all by itself.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
EXPLANATION: step by step reasoning here
GRADE: CORRECT or INCORRECT here

Grade the student answers based ONLY on their factual accuracy with respect to \
the true answer. Ignore differences in punctuation and phrasing between the \
student answer and true answer. Begin! 

QUESTION: Can I carry multiple big items so long as only one is equipped?
STUDENT ANSWER: A player can carry multiple big items, including Steeds, but only one can be equipped. If a player loses the ability to have more than one big item, they must correct the issue or discard all but one big item.
TRUE ANSWER: No. You can only carry one big item at a time.
EXPLANATION: The true answer states that you cannot carry more than one big item at a time. The student answer states a player can carray multiple big items. This contradicts the true answer. Therefore, the student answer is incorrect.
GRADE: INCORRECT

QUESTION: How many players can join me in a combat?
STUDENT ANSWER: In Munchkin, during combat against a monster, other players can help by adding their combat strength. Multiple players can contribute monsters to a fight using the Wandering Monster card. The player fighting the monster must defeat the combined combat strength of all the joined monsters. However, the rules do not explicitly state a maximum number of players who can contribute monsters to join a combat.
TRUE ANSWER: Only one player can join you in combat.
EXPLANATION: The true answer says that only one other player can join someone in combat. The student answer claims there is no maxiumum to the number of players who can join combat. Therefore, the student answer is incorrect.
GRADE: INCORRECT

QUESTION: If a hireling is removed from play due to Bad Stuff, does the player retain any items the hireling was carrying?
STUDENT ANSWER: If a Hireling is removed from play due to Bad Stuff or any other circumstances, the items the Hireling was carrying are lost, not retained by the player.
TRUE ANSWER: No. When a hireling is removed from play due to bad stuff, any items the hireling was carrying are also removed from play.
EXPLANATION: The student answer explains that if a Hireling is removed from play for any reason, the items the Hireling was carrying are lost and not retained by the player. The true answer states that when a hireling is removed from play due to bad stuff, the items carried by the hireling are also removed from play. Both the student answer and the true answer convey the same core information: that the player does not retain the items when the hireling is removed from play.
GRADE: CORRECT

QUESTION: When can I discard a Race card?
STUDENT ANSWER: To discard a Race card in Munchkin, you can do so to sell for a level, trade with another player, give to a player who wants it, power a Class or Race ability, or comply with a Curse or monster's Bad Stuff effect.
TRUE ANSWER: You can discard your Race card at any time.
EXPLANATION: The true answer states that you can discard your race at any time. The student answer suggests scenarios in which a race card might be discarded, but fails to mention the race card may be discarded at any time. Therefore, the student answer is incorrect.
GRADE: INCORRECT

QUESTION: What is an item in play but not equipped called?
STUDENT ANSWER: In Munchkin, an item that is in play but not equipped is referred to as an item turned sideways. These items are not being used during gameplay.
TRUE ANSWER: An item in play but not equipped is called a carried item.
EXPLANATION: The true answer states that an item in play but not equipped is called a carried item. The student answer makes no mention of a "carried item". Therefore, the student answer is incorrect.
GRADE: INCORRECT

QUESTION: {query}
STUDENT ANSWER: {result}
TRUE ANSWER: {answer}
EXPLANATION:"""
eval_prompt = PromptTemplate(
    input_variables=["query", "result", "answer"], template=eval_template
)

def predict_answer(test_cases, chain):
    test_cases = copy.deepcopy(test_cases)
    for test_case in test_cases:
        query = test_case["query"]
        result = chain.invoke(query)
        context = None
        if isinstance(result, dict):
            context = result.get("context")
            result = result.get("answer")
        result = result.content
        test_case["result"] = result
        if context is not None:
            test_case["context"] = context
    return test_cases


def grade(test_cases, llm):
    test_cases = copy.deepcopy(test_cases)
    eval_chain = QAEvalChain.from_llm(llm, prompt=eval_prompt)
    results = eval_chain.evaluate(examples=test_cases, predictions=test_cases)
    # It looks like, once upon a time, the eval chain did this bit for us... but
    # now it doesn't? Maybe a bug? Unclear as there seem to have been other 
    # intentional changes in behavior. Anyway, we do it ourselves now.
    results = [eval_chain._prepare_output(result) for result in results]

    # Merge the test cases and results
    for test_case, result in zip(test_cases, results):
        test_case.update(result)

    return test_cases


def print_graded(graded):
    for test_case in graded:
        print("query:", test_case["query"])
        print("answer:", test_case["result"])
        print("reference:", test_case["answer"])
        print("reasoning:", test_case["reasoning"])
        print("score:", test_case["score"])
        if "expected_score" in test_case:
            print("expected_score:", test_case["expected_score"])
        print()


def get_overall_score(scores):
    return sum([score["score"] or 0 for score in scores]) / len(scores)

In [41]:
example_test_cases = test_cases[:1]
example_test_cases = predict_answer(example_test_cases, complete_chain)
example_test_cases = grade(example_test_cases, grader_chat_model.with_config({"callbacks": [ConsoleCallbackHandler()]}))
print_graded(example_test_cases)

[32;1m[1;3m[llm/start][0m [1m[1:llm:HuggingFaceChatModel] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are an AI assistant to help board game players find answers to their rule questions.\nHuman: You are an AI assistant to help board game players find answers to their rule questions.\n\nAnswer the question based only on the following board game rule excerpts. Do not use any other information. Never use the word \"excerpts\" in your answer. Simply refer to the context as the rules.\n\n----\n---\nNAME: munchkin_rules/munchkin_rules-1.pdf\nPAGE: 2\nPASSAGE:\nTURN PHASES\nYour turn begins as soon as the previous player's turn ends. When your cards are arranged the way you want, go to phase iL (1) Kick Open The Door: Draw one card from the Door deck and turn it face up.\nIf it’s a monster, you must fight it. See Combat, p- 3. If the card is a curse - see Curses, p- Sait applies to you immediately (if it can) and is discarded (unless it has a persistent effect or yo

### Grade the Grader

We have a set of QA pairs that have been evaluated by a human grader for
correctness. We use these examples to compare the performance of our llm grader
to the human grader.

In [42]:
with open("grader_test_cases.json", "r") as f:
    grader_test_cases = json.loads(f.read())
graded = grade(grader_test_cases, chat_model.bind(temperature=0, max_tokens=1000))
grader_got_wrong = [grade for grade in graded if grade["score"] != grade["expected_score"]]
print_graded(grader_got_wrong)
sum(1 if grade["score"] == grade["expected_score"] else 0 for grade in graded) / len(graded)

[32;1m[1;3m[llm/start][0m [1m[1:llm:HuggingFaceChatModel] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: You are a teacher grading a quiz.\nYou are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.\nWrite out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset.\nThe student answer may contain additional information, it may also provide clarifications and exceptions to the true answer, so long as it does not contradict the true asnwer. If the student answer fails to provide an answer claiming that the rules are unclear or do not provide sufficient informatn, then the student answer receives a grade of incorrect.\nThere is no partial credit. Answers are either CORRECT or INCORRECT.\nAt the end, always output \"GRADE: CORRECT\" or \"GRADE: INCORRECT\" (without the quotes) to indicate your final conclus

0.8571428571428571

### Evaluate chain

In [45]:
# 22m 16.4s
# 0.7619047619047619
if True:
    predictions = predict_answer(test_cases, complete_chain)
    with open("test_cases.json", "w") as f:
        f.write(json.dumps(predictions))
    # graded = grade(predictions, chat_model)
    # print_graded(graded)
    # get_overall_score(graded)

with open("test_cases.json", "r") as f:
    predictions = json.loads(f.read())

graded = grade(predictions, chat_model.bind(temperature=0, max_tokens=1000))
print_graded(graded)
get_overall_score(graded)

[32;1m[1;3m[llm/start][0m [1m[1:llm:HuggingFaceChatModel] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are an AI assistant to help board game players find answers to their rule questions.\nHuman: You are an AI assistant to help board game players find answers to their rule questions.\n\nAnswer the question based only on the following board game rule excerpts. Do not use any other information. Never use the word \"excerpts\" in your answer. Simply refer to the context as the rules.\n\n----\n---\nNAME: munchkin_rules/munchkin_rules-1.pdf\nPAGE: 2\nPASSAGE:\nTURN PHASES\nYour turn begins as soon as the previous player's turn ends. When your cards are arranged the way you want, go to phase iL (1) Kick Open The Door: Draw one card from the Door deck and turn it face up.\nIf it’s a monster, you must fight it. See Combat, p- 3. If the card is a curse - see Curses, p- Sait applies to you immediately (if it can) and is discarded (unless it has a persistent effect or yo

0.6190476190476191

In [46]:
for doc in retriever.invoke("Can I carry multiple big items so long as only one is equipped?"):
    print(doc.page_content)
    print()
    print()
    

ITEMS
Most Treasures are Items. Items have a Gold Piece value. “No Value” is equivalent to zero Gold Pieces, and a “No Value” card is considered an Item.
All Items you have in play are considered “carried.” Items that are actually giving you a bonus are “equipped.” You should indicate Items that are not equipped by turning the cards sideways. You may not alter the status of your Items during a combat or while running away.
Anyone can carry any Item (except for extra Big items; see below), but you may equip only one Headgear, one suit of Armor, one pair of Footgear, and two “I Hand” Items (or one “2 Hands” Item) . . . unless you have a card that lets you ignore these limits, such as Hireling or Cheat!, or unless one of the cards says otherwise. If you are carrying two Headgear cards, for instance, you can equip only one of them at a time.
Likewise, some Items have restrictions: for instance, the Mace of Sharpness can only be wielded by a Cleric. Its bonus only counts for someone who is,

## Optimize Parameters

In [25]:
retriever_config = RetrieverConfig(
    max_context_size=4096,
    percent_context_use=0.75,
    parent_percent=0.15,
    parent_overlap_percent=0.1,
    child_percent=0.25,
    child_overlap_percent=0.1,
)

sampling_config = SamplingConfig(
    temperature=0.7,
    # top_k=0,
    # top_p=None,
    samples=5,
)

rag_config = RagChainConfig(
    thread_of_thought_enabled=True,
)

retriever, rag_chain_with_context = build_complete_chain(
    chat_model=chat_model,
    tokenizer=tokenizer,
    vectorstore=db,
    retriever_config=retriever_config,
    sampling_config=sampling_config,
    rag_config=rag_config,
)
# print(rag_chain_with_context)
rag_chain = rag_chain_with_context | itemgetter("answer")

In [27]:
retriever.add_documents(rule_docs)

In [30]:
from langchain.callbacks.tracers import ConsoleCallbackHandler

# rag_chain = build_basic_rag_chain(
#     retriever=retriever,
#     chat_chain=chat_model,
#     prompt=rag_config.rag_prompt,
# )

rag_chain_trace = rag_chain.with_config({'callbacks': [ConsoleCallbackHandler()]})

rag_chain_trace.invoke("Can I play a Go Up a Level card during combat?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "Can I play a Go Up a Level card during combat?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "input": "Can I play a Go Up a Level card during combat?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "Can I play a Go Up a Level card during combat?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnablePassthrough] [0ms] Exiting Chain run with output:
[0m{
  "output": "Can I play a Go Up a Level card during combat?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel] [1ms] Exiting Chain run with output:
[0m{
  "question": "Can I play a Go Up a Level card during combat?"
}
[32;1m[1;3m[c

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mas

[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 4:chain:thread-of-thought > 15:chain:RunnableAssign > 16:chain:RunnableParallel > 17:chain:RunnableSequence > 20:chain:chat-sampling] [39.64s] Exiting Chain run with output:
[0m{
  "output": [
    {
      "lc": 1,
      "type": "constructor",
      "id": [
        "langchain",
        "schema",
        "messages",
        "AIMessage"
      ],
      "kwargs": {
        "content": "The question is about whether or not a \"Go Up a Level\" card can be played during combat in the game Munchkin.\n\nThe context starts with a passage from the rules of the game which lists actions that can be taken at any time and actions that can be taken on your own turn. The \"Go Up a Level\" card is listed as an action that can be taken on your own turn.\n\nThe passage then moves on to discuss disputes between cards and rules. It states that nothing can reduce a player below Level 1, but that a player can go up a level after combat only if they h

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 4:chain:thread-of-thought > 15:chain:RunnableAssign > 16:chain:RunnableParallel > 17:chain:RunnableSequence > 21:chain:RunnableBranch > 23:chain:RunnableEach > 24:chain:RunnableSequence > 26:llm:HuggingFaceChatModel] [8.73s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "No, a \"Go Up a Level\" card cannot be played during combat in the game Munchkin. According to the rules, a player can only go up a level after defeating a monster, and this must be done on their own turn. Additionally, the rules state that nothing can reduce a player below Level 1, so a player cannot use a \"Go Up a Level\" card to increase their level during combat.",
        "generation_info": null,
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
     

AIMessage(content='No, a "Go Up a Level" card cannot be played during combat in the game Munchkin. According to the rules, a player can only go up a level after defeating a monster, and this must be done on their own turn. Additionally, the rules state that nothing can reduce a player below Level 1, so a player cannot use a "Go Up a Level" card to increase their level during combat.')

In [41]:
chain = (
    RunnablePassthrough.assign(result=itemgetter("query") | rag_chain | StrOutputParser())
    | QAEvalChain.from_llm(chat_model)
) 

chain.with_config({"callbacks": [ConsoleCallbackHandler()]}).invoke(
    {
        "query": "If a monster does not pursue me because my level is too low, can I still loot the room?",
        "result": "No, if a monster does not pursue me because my level is too low, I cannot loot the room. According to the rules, a player can only go up a level after defeating a monster in combat. If the monster does not pursue me, I cannot defeat it and therefore cannot go up a level. Additionally, the rules state that a player cannot collect rewards for defeating a monster in the middle of a combat. If I do not defeat the monster, I cannot collect any rewards. Therefore, I cannot loot the room."
        "answer": "No, you cannot loot the room.",
    }
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "query": "If a monster does not pursue me because my level is too low, can I still loot the room?",
  "answer": "No, you cannot loot the room."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableAssign] Entering Chain run with input:
[0m{
  "query": "If a monster does not pursue me because my level is too low, can I still loot the room?",
  "answer": "No, you cannot loot the room."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableAssign > 3:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "query": "If a monster does not pursue me because my level is too low, can I still loot the room?",
  "answer": "No, you cannot loot the room."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableAssign > 3:chain:RunnableParallel > 4:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "q

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_

[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableAssign > 3:chain:RunnableParallel > 4:chain:RunnableSequence > 8:chain:RunnableParallel > 10:chain:thread-of-thought > 18:chain:chat-sampling] [34.94s] Exiting Chain run with output:
[0m{
  "output": [
    {
      "lc": 1,
      "type": "constructor",
      "id": [
        "langchain",
        "schema",
        "messages",
        "AIMessage"
      ],
      "kwargs": {
        "content": "Q: If a monster does not pursue me because my level is too low, can I still loot the room?\n\nStep 1: Understand the context\n\nThe passage is from a rulesheet for the card game \"Munchkin.\" In this game, players take on the role of adventurers who fight monsters to gain levels and treasure. The rules for combat and looting are outlined in the passage.\n\nStep 2: Analyze the question\n\nThe question asks if a player can still loot the room if a monster does not pursue them because their level is too low.\n\nStep 3: Summariz

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableAssign > 3:chain:RunnableParallel > 4:chain:RunnableSequence > 8:chain:RunnableParallel > 10:chain:thread-of-thought > 19:chain:RunnableBranch > 21:chain:RunnableEach > 24:chain:RunnableSequence > 26:llm:HuggingFaceChatModel] [15.47s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "No, if a monster does not pursue me because my level is too low, I cannot loot the room. According to the rules, a player can only go up a level after defeating a monster in combat. If the monster does not pursue me, I cannot defeat it and therefore cannot go up a level. Additionally, the rules state that a player cannot collect rewards for defeating a monster in the middle of a combat. If I do not defeat the monster, I cannot collect any rewards. Therefore, I cannot loot the room.",
        "generation_info": null,
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
         

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableAssign > 3:chain:RunnableParallel > 4:chain:RunnableSequence > 29:chain:universal-consistency > 30:chain:RunnableSequence > 43:chain:RunnableAssign > 44:chain:RunnableParallel > 45:chain:RunnableSequence > 47:llm:HuggingFaceChatModel] [476ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "The most consistent response is Response 2.",
        "generation_info": null,
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "The most consistent response is Response 2."
          }
        }
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableAssign > 3:chain:RunnableParallel > 

{'query': 'If a monster does not pursue me because my level is too low, can I still loot the room?',
 'answer': 'No, you cannot loot the room.',
 'result': 'No, if a monster does not pursue me because my level is too low, I cannot loot the room. According to the rules, a player can only go up a level after defeating a monster in combat. If the monster does not pursue me, I cannot defeat it and therefore cannot go up a level. Additionally, the rules state that a player cannot collect rewards for defeating a monster in the middle of a combat. If I do not defeat the monster, I cannot collect any rewards. Therefore, I cannot loot the room.',
 'results': 'CORRECT.'}

In [43]:
m = "As a content reviewer, I provide multiple retrieved passages about this question; you need to answer the question.\n\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n----\n\n---\nNAME: munchkin_rules/munchkin_rules-1.pdf\nPAGE: 2\nPASSAGE:\nWhen You May Take Actions\nYou may perform these actions at any time: & Discard a Class or Race.\n& Play a Go Up a Level or Hireling. Play a Curse.\nYou may perform these actions at any time, as long as you are not in combat: Trade an Item with another player (the other player may not be in combat, either).\ny Change which Items you have equipped.\nPlay a card that you have just received (some cards may be played even during combat; see above).\nYou may perform these actions on your own turn: 3 Play a new Class or Race card (at any time).\nwe Sell Items for levels (except when you are in combat). Play an Item (most Items cannot be played during combat, but some one-shot Items can; see p- 3).\n---\n\n\n---\nNAME: munchkin_rules/munchkin_rules-1.pdf\nPAGE: 1\nPASSAGE:\nConflicts Between Cards and Rules\nThis rulesheet gives the general tules. Many cards add special rules, so in most cases when the rulesheet disagrees with a card, follow the card. However, ignore any card effect that might seem to contradict one of the rules listed below unless the card explicitly says it supersedes that rule!\niL Nothing can reduce a player below Level 1, although card effects might reduce a player's or a monster's combat strength (p. 3) below I.\n2. You go up a level after combat only if you Ail a monster.\n3. You cannot collect rewards for defeating a monster (eg., Treasure, levels) in the middle of a combat. You must finish the fight before gaining any rewards.\n4. You must killa monster to reach Level 10, and you cannot force another player to help you do it.\nAny other disputes should be settled by loud arguments, with the owner of the game having the last word. You could also read the Munchkin FAQ and errata pages at munchkin.game, or start a discussion at forums.sjgames.com/, munchkin . . . unless it’s more fun to argue.\n\nSTEVE JACKSON GAMES\nYour Hand: Cards in your hand are not in play. They don’t help you, but they can’t be taken away except by cards that specifically affect “your hand.” At the end of your turn, you may have no more than five cards in your hand (see Charity, p- 2).\nCards in play may not be returned to your hand - they must be discarded or traded if you want to get rid of them.\n---\n\n----\n\nQ: Can I play a Go Up a Level card during combat?\nWalk me through this context in manageable parts step by step, summarizing and analyzing as we go.\n"
print(m)
print()
print()
print(chat_model.invoke(m).content)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


As a content reviewer, I provide multiple retrieved passages about this question; you need to answer the question.

If you don't know the answer, just say that you don't know, don't try to make up an answer.

----

---
NAME: munchkin_rules/munchkin_rules-1.pdf
PAGE: 2
PASSAGE:
When You May Take Actions
You may perform these actions at any time: & Discard a Class or Race.
& Play a Go Up a Level or Hireling. Play a Curse.
You may perform these actions at any time, as long as you are not in combat: Trade an Item with another player (the other player may not be in combat, either).
y Change which Items you have equipped.
Play a card that you have just received (some cards may be played even during combat; see above).
You may perform these actions on your own turn: 3 Play a new Class or Race card (at any time).
we Sell Items for levels (except when you are in combat). Play an Item (most Items cannot be played during combat, but some one-shot Items can; see p- 3).
---


---
NAME: munchkin_rul

In [70]:
# # Using regular LLM interface
# from langchain.llms import VLLMOpenAI

# llm = VLLMOpenAI(
#     openai_api_key="EMPTY",
#     openai_api_base="http://localhost:8000/v1",
#     temperature=0.1,
#     # model_kwargs=dict(repetition_penalty=1.1),
#     max_tokens=2_000,
#     model_name=model_name,
#     frequency_penalty=0.2,
# )
# print(llm("[INST] Generate 10 names for a fantasy elf Paladin. [/INST] "))

1. Galadriel
2. Elrond
3. Legolas
4. Arwen
5. Thranduil
6. Faramir
7. Eärendil
8. Lúthien
9. Glorfindel
10. Celebrindor


In [13]:
# To parse the PDFs, there are three strategies available: "fast", "hi_res", and
# "ocr_only". For the PDFs used here, "fast" retrieves a bunch of duplicate text
# in the wrong order. "hi_res" doesn't handle columns of text well and produces
# incoherent results. "ocr_only" seems to work reasonably well in this case.
rule_docs = []
for filename in data_path.glob("*.pdf"):
    print(f"Processing {filename}")
    loader = UnstructuredPDFLoader(filename, strategy="ocr_only")
    rule_docs.extend(loader.load())

Processing munchkin_rules/munchkin_rules-1.pdf


Processing munchkin_rules/puppies-rules.pdf
Processing munchkin_rules/princesses_rules.pdf
Processing munchkin_rules/munch_4_rules_20thp.pdf


In [14]:
# Chunk text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunked_documents = text_splitter.split_documents(rule_docs)

In [17]:
!nvidia-smi

Fri Dec  1 23:51:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:07:00.0 Off |                  N/A |
|  0%   47C    P8    23W / 420W |    139MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
# import faiss
# Load chunked documents into the FAISS index
db = FAISS.from_documents(
    chunked_documents, 
    embedding_model
)

In [29]:
!nvidia-smi

Fri Dec  1 23:57:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:07:00.0 Off |                  N/A |
|  0%   49C    P8    21W / 420W |  22475MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [57]:
prompt_template = """[INST] 
Instruction: You are an assitant to help answer questions about board game rules. Answer questions concisely and in one or two sentences. Rely upon the following passages from the rulebook when answering questions.

{context}

QUESTION:
{question} 
[/INST]"""

# text_generation_pipeline = transformers.pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     # temperature=0.2,
#     # repetition_penalty=1.1,
#     # return_dict_in_generate=True,
#     # output_scores=True,
#     return_full_text=True,
#     max_new_tokens=1000,
# )
# text_generation_pipeline.model.config.pad_token_id = text_generation_pipeline.model.config.eos_token_id

# mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain 
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Create retriever
# retrieve_topk
retriever = db.as_retriever(search_kwargs={"k": 3})

def format_docs(docs):
    passages = []
    for i, doc in enumerate(docs):
        passages.append(f"Passage {i+1}: {doc.page_content}")
    return "\n\n".join(passages)

# Create rag chain
rag_chain = ( 
 {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | llm_chain
)

In [None]:
import langchain
langchain.debug = False

In [58]:
for i, test_case in enumerate(test_cases):
    # # Hack to avoid a warning about horribly inefficient our use of the GPU is
    # text_generation_pipeline.call_count = 0

    result = rag_chain.invoke(test_case["question"])

    print("input:", test_case["question"].strip())
    print("reference:", test_case["answer"].strip())
    print("prediction:", result["text"].strip())

    if i < len(test_cases) - 1:
        print()
        print()

input: If a monster does not pursue me because my level is too low, can I still loot the room?
reference: No. If a monster does not pursue you, it means you automatically run away from it. However, you still count as having been in combat and cannot loot the room.
prediction: No, if a monster does not pursue you because your level is too low, you cannot loot the room.


input: Can I sell items from my hand to go up a level, assuming I can sell 1,000 gold pieces worth?
reference: Yes. You can sell items from your hand to go up a level.
prediction: No, you cannot sell items from your hand to go up a level. You can only sell items worth a total of at least 1,000 Gold Pieces and immediately go up one level.


input: If a hireling is removed from play due to Bad Stuff, does the player retain any items the hireling was carrying?
reference: No. When a hireling is removed from play due to bad stuff, any items the hireling was carrying are also removed from play.
prediction: If a Hireling is re

In [60]:
def ask(question):
    result = rag_chain.invoke(question)
    print("CONTEXT")
    print(result["context"])
    # for i, doc in enumerate(result["context"]):
    #     print(f"Document {i}:")
    #     print(f"{doc}")
    #     print("\n")
    print()
    print("ANSWER")
    print(result["text"].strip())

In [61]:
ask("Can I play a Go Up a Level card during combat on my turn?")

CONTEXT
Passage 1: Hireling may be played at any time, on any turn. You cannot give a Hireling an Item to carry while you are in combat, however.

COMBAT

To fight a monster, compare its combat strength to yours. Combat strength is the total of Level plus all modifiers - positive or negative - given by Items and other cards. If the monster's combat strength is equal to yours, or greater, you lose the combat and must Run Away (see p. 5). If your combat strength totals more than the monster's — note that monsters win ties! — you kill it and goupa level (two levels for some big monsters). You'll also get the number of Treasures shown on its card.

Sometimes a card will let you get rid of the monster without killing it. This is still “winning,” but you don't get a level. Unless the ability says otherwise, you don’t get the Treasures, either. If the last monster is removed from a combat, it ends instantly.

Some monster cards have special powers that affect combat

Passage 2: killing a mons

In [77]:
# Lets try thread of thought
prompt_template = """[INST] As a content reviewer, I provide multiple passages about this question; you need to answer the question.
Passage 1: 

Hireling may be played at any time, on any turn. You cannot give a Hireling an Item to carry while you are in combat, however.

COMBAT

To fight a monster, compare its combat strength to yours. Combat strength is the total of Level plus all modifiers - positive or negative - given by Items and other cards. If the monster's combat strength is equal to yours, or greater, you lose the combat and must Run Away (see p. 5). If your combat strength totals more than the monster's — note that monsters win ties! — you kill it and goupa level (two levels for some big monsters). You'll also get the number of Treasures shown on its card.

Sometimes a card will let you get rid of the monster without killing it. This is still “winning,” but you don't get a level. Unless the ability says otherwise, you don’t get the Treasures, either. If the last monster is removed from a combat, it ends instantly.

Some monster cards have special powers that affect combat

Passage 2: 

killing a monster, unless a card specifically allows you to win another way.

When You May Take Actions

You may perform these actions at any time: & Discard a Class or Race.

& Play a Go Up a Level or Hireling. Play a Curse.

You may perform these actions at any time, as long as you are not in combat:

Trade an Item with another player (the other player may not be in combat, either).

y Change which Items you have equipped.

Play a card that you have just received (some cards may be

played even during combat; see above).

You may perform these actions on your own turn:

3 Play a new Class or Race card (at any time).

we Sell Items for levels (except when you are in combat). Play an Item (most Items cannot be played during combat, but some one-shot Items can; see p- 3).

TURN PHASES

Your turn begins as soon as the previous player's turn ends. When your cards are arranged the way you want, go to phase iL

(1) Kick Open The Door: Draw one card from the Door deck and turn it face up.

Passage 3: 

Conflicts Between Cards and Rules

This rulesheet gives the general tules. Many cards add special rules, so in most cases when the rulesheet disagrees with a card, follow the card. However, ignore any card effect that might seem to contradict one of the rules listed below unless the card explicitly says it supersedes that rule!

iL Nothing can reduce a player below Level 1, although card effects might reduce a player's or a monster's combat strength (p. 3) below I.

2. You go up a level after combat only if you Ail a monster.

3. You cannot collect rewards for defeating a monster (eg., Treasure, levels) in the middle of a combat. You must finish the fight before gaining any rewards.

4. You must killa monster to reach Level 10, and you cannot force another player to help you do it.

Question: Can I play a Go Up a Level card during combat on my turn?
Walk me through this context in manageable parts step by step,
summarizing and analyzing as we go.
Answer:
"""

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=[],
    template=prompt_template,
)

# Create llm chain 
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Create retriever
# retrieve_topk
retriever = db.as_retriever(search_kwargs={"k": 3})

def format_docs(docs):
    passages = []
    for i, doc in enumerate(docs):
        passages.append(f"Passage {i+1}: {doc.page_content}")
    return "\n\n".join(passages)

# Create rag chain
# rag_chain = ( 
#  {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | llm_chain
# )

In [78]:
thot_chain = llm_chain
print(thot_chain.invoke({})["text"])


Passage 1:

* Hireling can be played at any time, on any turn.
* Cannot give a Hireling an Item to carry while in combat.

Passage 2:

* You may perform these actions at any time: Discard a Class or Race, Play a Go Up a Level or Hireling, Play a Curse.
* You may perform these actions on your own turn: Play a new Class or Race card, Sell Items for levels, Play an Item.

Passage 3:

* Conflicts Between Cards and Rules.
* Nothing can reduce a player below Level 1.
* You go up a level after combat only if you kill a monster.
* You cannot collect rewards for defeating a monster in the middle of a combat.
* You must kill a monster to reach Level 10 and cannot force another player to help you do it.

Answer: No, you cannot play a Go Up a Level card during combat on your turn because the rules state that you can only go up a level after combat if you kill a monster. Additionally, the rules state that you cannot collect rewards for defeating a monster in the middle of a combat, so you would ne

In [None]:
ask("What is the card limit for how many cards can be in my hand?")

CONTEXT
Any other disputes should be settled by loud arguments, with the owner of the game having the last word. You could also read the Munchkin FAQ and errata pages at munchkin.game, or start a discussion at forums.sjgames.com/, munchkin . . . unless it’s more fun to argue.

STEVE JACKSON GAMES

Your Hand: Cards in your hand are not in play. They don’t help you, but they can’t be taken away except by cards that specifically affect “your hand.” At the end of your turn, you may have no more than five cards in your hand (see Charity, p- 2).

Cards in play may not be returned to your hand - they must be discarded or traded if you want to get rid of them.

CHARACTER CREATION

Everyone starts as a Level | human with no class. (Heh, heh.) Munchkin characters may be either male or female. Your character's sex is the same as your own at the start of the game, unless you declare otherwise.

Anyone can carry any Item (except for extra Big items; see below), but you may equip only one Headgear, 