In [1]:
import csv
from bs4 import BeautifulSoup
import requests
import ollama
import openai
import psycopg2

from pgvector.psycopg2 import register_vector
from dotenv import load_dotenv
import os
import numpy as np
from rich.jupyter import print
from psycopg2.extras import RealDictCursor
from IPython.display import display, Markdown

In [15]:
gpt_model = "llama3.2" 
ollama_url = 'http://localhost:11434/v1'

## Simple example

In [111]:
output = ollama.generate(
  model=gpt_model,
  
  prompt=f"What is the second family name of Javier Martinez according to spanish rule of family names? you are force to provide an answer and provide a single word answer"
)
output.get("response")

'Sánchez.'

In [110]:
data = "Javier Martinez full name is Javier Martinez Alcantara"
prompt= "What is the second family name of Javier Martinez according to spanish rule of family names? you are force to provide an answer and provide a single word answer"

output = ollama.generate(
  model=gpt_model,
  prompt=f"Using this context: {data}. Respond to this prompt: {prompt}",
)
output.get("response")

'Alcantara'

## Go for a functional RAG with a use case based on ...

![alt text](./assets/warhammer.jpg "Warhammer")

# Data Extraction

In [None]:

url="https://tow.whfb.app"

html_content = requests.get(url).text


soup = BeautifulSoup(html_content)

soup.findAll("a")
links = soup.findAll("a")

with open(f"old_world_rules3.csv", 'w', newline='') as out_file:
    headers = [
        "rule_name", 
        "link_number",
        "link",
        "rulebook_page",
        "rule_description"
    ]
    writer = csv.DictWriter(out_file, headers)
    writer.writeheader()
    total_links = len(links)
    
    id = 0
    for number, link in enumerate(links):
        try:
            # if number == 20: 
            #     break
            l = url + link.get("href")

            print(f"Processing {number} out of {total_links}")
            html_content = requests.get(l).text


            # Parse the html content
            subsoup = BeautifulSoup(html_content)
            rule_name = subsoup.find("h1", attrs={"class": "page-title"}).text.removesuffix("URL Copied!")
            rulebook_page = subsoup.find("li", attrs={"class": "page-reference"}).text
            rule_description = ""

            if subsoup.find("article", attrs={"class": "article--rich-text"}):
                rule_description = subsoup.find("article", attrs={"class": "article--rich-text"}).text
            
            if subsoup.find("article", attrs={"class": "article--rich-text section-intro"}):
                rule_description += subsoup.find("article", attrs={"class": "article--rich-text section-intro"}).text
            
            row = {
                "rule_name": rule_name,
                "link_number": str(id),
                "link": l,
                "rulebook_page": rulebook_page,
                "rule_description": rule_description
                }

            writer.writerow(row)
            id += 1

            spells = subsoup.find("div", attrs={"class": "spell-list"}).findAll("a")
            spells = [ spell for spell in spells if "/spell" in spell.get("href")]
            # print(spells)
            if spells:
                for spell in spells:
                    print(spell)
                    spell_link = url + spell.get("href")

                    html_content = requests.get(spell_link).text 
                    spell = BeautifulSoup(html_content)
                    
                    spell_name = spell.find("h1", attrs={"class": "page-title"}).text.removesuffix("URL Copied!")
                    spell_rulebook_page = spell.find("li", attrs={"class": "page-reference"}).text

                    descriptions = spell.findAll("article", attrs={"class": "article--rich-text"})
                    spell_description = descriptions[0].text
                    
                    spell_info = [el.text for el in descriptions[1].findAll("p") if el.text]
                    # print(spell_info)
                    spell_description += " " + spell_info.pop()
                    # print(spell_info)
                    for info in range(0,len(spell_info),2):
                        spell = spell_info[info+1].replace('"', " inches")
                        spell_description += f" {spell_info[info]}:{spell},"
                    
                    spell_description = spell_description.removesuffix(",")
                    row = {
                        "rule_name": spell_name,
                        "link_number": str(id),
                        "link": spell_link,
                        "rulebook_page": spell_rulebook_page,
                        "rule_description": spell_description
                        }

                    writer.writerow(row)
                    id += 1

        except Exception as e:
            print(e)
            continue

# Database initialization

In [112]:
# Set up Postgres
load_dotenv(override=True)
DBUSER = os.environ ["DBUSER"]
DBPASS = os.environ["DBPASS"]
DBHOST = os.environ["DBHOST"]
DBNAME = os.environ ["DBNAME" ]

# Use SSL if not connecting to localhost
DBSSL = "disable"
if DBHOST != "localhost":
    DBSSL = "require"
    
conn = psycopg2.connect(database=DBNAME, user=DBUSER, password=DBPASS, host=DBHOST, sslmode=DBSSL)
conn.autocommit = True
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

# Embeddings generation

In [37]:
# Set up Github models
endpoint = "https://models.inference.ai.azure.com"
model_name = "text-embedding-3-small"

Generating the table that will contain the embeddings

In [113]:
EMBEDDINGS_SIZE = 1024

cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
cur.execute("DROP TABLE IF EXISTS rules")
cur.execute(f"""CREATE TABLE rules (
            link_number int PRIMARY KEY,
            rule_name varchar,
            link varchar,
            rulebook_page varchar,
            rule_description varchar,
            embedding vector({EMBEDDINGS_SIZE})
            );""")
register_vector(conn)

cur.execute("CREATE INDEX ON rules USING hnsw (embedding vector_cosine_ops)")
#Hierarchical Navigable Small World 


Reading rules from our disk and load them into memory:

In [114]:
with open(f"old_world_rules3.csv", 'r', newline='') as csvfile:
    rules = csv.DictReader(csvfile, delimiter=',')


    # Iterating the rules
    for rule in rules:
        link_number = rule.get("link_number")
        rule_name = rule["rule_name"]
        link = rule["link"]
        rulebook_page = rule["rulebook_page"]
        rule_description = rule["rule_description"]

        embedding = ollama.embeddings(
            model="mxbai-embed-large", #https://www.mixedbread.ai/blog/mxbai-embed-large-v1
            prompt=f"Rule: {rule_name}. Rule Description: {rule_description}"
            )
        # print(f"{link_number}, {rule_name}, {link}, {rulebook_page}, {rule_description}, {embedding.get("embedding")}")
        cur.execute(
            "INSERT INTO rules (link_number, rule_name, link, rulebook_page, rule_description, embedding) VALUES (%s,%s,%s,%s,%s,%s)", 
            (link_number, rule_name, link, rulebook_page, rule_description, embedding.get("embedding"),)
            )
        


# Comparison with embeddings using vector search

In [115]:
query = "SELECT rule_name, embedding <-> %s as distance FROM rules ORDER BY distance LIMIT 20"

prompt = "how to charge a unit?"
cur = conn.cursor(cursor_factory=RealDictCursor)
embedding = np.array(ollama.embeddings(model="mxbai-embed-large", prompt=prompt).get("embedding"))
cur.execute(query, (embedding,))
results=cur.fetchall()
print(results)


In [117]:
query = "SELECT rule_name, embedding <-> %s as distance FROM rules ORDER BY distance LIMIT 20"

prompt = "what is an asrai?"
cur = conn.cursor(cursor_factory=RealDictCursor)
embedding = np.array(ollama.embeddings(model="mxbai-embed-large", prompt=prompt).get("embedding"))
cur.execute(query, (embedding,))
results=cur.fetchall()
print(results)



![alt text](./assets/wood_elf.jpg "Asrai")

# Comparison with embeddings using full text search

In [119]:
query = """
    SELECT rule_name, ts_rank_cd(to_tsvector('english', rule_description), query) as rank FROM rules, plainto_tsquery('english', %(prompt)s) query
    WHERE to_tsvector('english', rule_description) @@ query
    ORDER BY rank
    DESC LIMIT 20
    """

prompt = "what is an asrai?"
cur = conn.cursor(cursor_factory=RealDictCursor)
embedding = np.array(ollama.embeddings(model="mxbai-embed-large", prompt=prompt).get("embedding"))
cur.execute(query, {"prompt": prompt})
results=cur.fetchall()
print(results)


In [120]:
query = """
    SELECT rule_name, ts_rank_cd(to_tsvector('english', rule_description), query) as rank FROM rules, plainto_tsquery('english', %(prompt)s) query
    WHERE to_tsvector('english', rule_description) @@ query
    ORDER BY rank
    DESC LIMIT 20
    """

prompt = "lance"
cur = conn.cursor(cursor_factory=RealDictCursor)
embedding = np.array(ollama.embeddings(model="mxbai-embed-large", prompt=prompt).get("embedding"))
cur.execute(query, {"prompt": prompt})
results=cur.fetchall()
print(results)

## Hybrid search

In [121]:
prompt = "asrai"
cur = conn.cursor(cursor_factory=RealDictCursor)
embedding = np.array(ollama.embeddings(model="mxbai-embed-large", prompt=prompt).get("embedding"))

query = """
WITH semantic_search AS (
    SELECT link_number, rule_name, RANK () OVER (ORDER BY embedding <=> %(embedding)s) AS rank
    FROM rules
    ORDER BY embedding <> %(embedding)s
    LIMIT 20
),
keyword_search AS (
    SELECT link_number, rule_name, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', rule_description), query) DESC) 
    FROM rules, plainto_tsquery('english', %(prompt)s) query 
    WHERE to_tsvector('english', rule_description) @@ query
    ORDER BY ts_rank_cd(to_tsvector('english', rule_description), query) 
    DESC 
    LIMIT 20
)

SELECT
    COALESCE (semantic_search.rule_name, keyword_search.rule_name) AS rule, 
    COALESCE (1.0 / (%(k)s + semantic_search.rank), 0.0) + COALESCE(1.0 / (%(k)s + keyword_search.rank), 0.0) AS rank
FROM semantic_search
FULL OUTER JOIN keyword_search ON semantic_search.link_number = keyword_search.link_number
ORDER BY rank 
DESC
LIMIT 20
"""


cur.execute(
    query, 
    {"prompt": prompt, "embedding": embedding, "k": 60}
    )
results=cur.fetchall()
print(results)

In [122]:
def ragtime(prompt: str) -> list[dict]:
    cur = conn.cursor(cursor_factory=RealDictCursor)
    embedding = np.array(ollama.embeddings(model="mxbai-embed-large", prompt=prompt).get("embedding"))

    query = """
    WITH semantic_search AS (
        SELECT link_number, RANK () OVER (ORDER BY embedding <=> %(embedding)s) AS rank
        FROM rules
        ORDER BY embedding <> %(embedding)s
        LIMIT 20
    ),
    keyword_search AS (
        SELECT link_number, RANK () OVER (ORDER BY ts_rank_cd(to_tsvector('english', rule_description), query) DESC) 
        FROM rules, plainto_tsquery('english', %(prompt)s) query 
        WHERE to_tsvector('english', rule_description) @@ query
        ORDER BY ts_rank_cd(to_tsvector('english', rule_description), query) 
        DESC 
        LIMIT 20
    )

    SELECT
        COALESCE (semantic_search.link_number, keyword_search.link_number) AS link_number, 
        COALESCE (1.0 / (%(k)s + semantic_search.rank), 0.0) + COALESCE(1.0 / (%(k)s + keyword_search.rank), 0.0) AS rank
    FROM semantic_search
    FULL OUTER JOIN keyword_search ON semantic_search.link_number = keyword_search.link_number
    ORDER BY rank 
    DESC
    LIMIT 5
    """


    cur.execute(
        query, 
        {
            "prompt": prompt, 
            "embedding": embedding, 
            "k": 60 # https://jkatz05.com/post/postgres/hybrid-search-postgres-pgvector/
        }
    )
    results=cur.fetchall()


    rules = [result.get("link_number") for result in results]
    # print(rules)
    cur.execute("SELECT rule_name, link, rulebook_page, rule_description FROM rules WHERE link_number = ANY(%s)", (rules,))
    results= cur.fetchall()

    return [dict(result) for result in results]
    

# Giving shape to the RAG

In [46]:
prompt = "can you explain me the veteran rule and provide me an example"

In [123]:
results= ragtime(prompt)
formatted_results = [f'##{result.get("rule_name")}\n\n{result.get("rule_description")}\n\nRulebook page:{result.get("rulebook_page")}\n\n Reference link:[{result.get("rule_name")}]({result.get("link")})' for result in results]

In [124]:
print(formatted_results)

In [54]:
system_message = (
    "You must answer questions only according to sources."
    "Say you dont know if you cant find answer in sources."
    "Provide an answer within the scope of the rules of the game called the old world."
    "Cite the source inside square brackets."
    "The sources should contain the reference link and the rulebook page."
    "Place the sources at the end as list of markdowns. An example of source: Rulebook, p. 209\n\n Reference link:[Too Many Characters](https://tow.whfb.app/characters/too-many-characters)"
    "The title of each rule will be a markdown heading."
)

In [55]:

client = openai.OpenAI(
    base_url = ollama_url,
    api_key='ollama', # required, but unused
)

response = client.chat.completions.create(
  model=gpt_model,
  messages=[
    {"role": "system", "content": system_message},
    {"role": "user", "content": f"{prompt}\n\n Sources:\n\n {formatted_results}"},
  ],
  temperature=0.3,
  top_p=1.0
)


In [56]:
print(response.choices[0].message.content)

In [92]:
def add_context(prompt: str)-> list[dict[str, str]]:
    system_message = (
    "You must answer questions only according to sources."
    "Say you dont know if you cant find answer in sources."
    "Respond as if you are an expert and within the scope of the rules of the game called the old world."
    "If there isn't enough information below, say you don't know."
    "Always include the references for each source you use in the response."
    "Use square brackets to reference the source, for example [52]."
    "Don't combine citations, list each product separately, for example [27][51]."
    # "Place the source references at the end as list references with rulebook page and reference link."
    "The source references will be always similar to the next example: [1] Rulebook, p. 209\n\n Link:https://tow.whfb.app/characters/too-many-characters"
    "Provide the answer in markdown format with headers"
    # "Use emojis when you consider that it will make it more interesting"
)

    
    results= ragtime(prompt)
    formatted_results = [f'##{result.get("rule_name")}\n\n{result.get("rule_description")}\n\nRulebook page:{result.get("rulebook_page")}\n\n Reference link:[{result.get("rule_name")}]({result.get("link")})' for result in results]

    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": f"In the context of warhammer the old world, provide me an answer to this: {prompt}\n\n using these sources:\n\n {formatted_results}"},
    ]

In [94]:
prompt = "what is the order of phases in warhammer the old world during a turn in the battle?"

response = client.chat.completions.create(
  model=gpt_model,
  messages=add_context(prompt),
  temperature=0.3,
  top_p=1.0
)
display(Markdown(response.choices[0].message.content))

**Turn Phases in Warhammer the Old World**
=============================================

According to the Warhammer: The Old World rulebook, a player's turn is divided into four phases:

### 1. Strategy Phase
[The Strategy Phase Sequence](https://tow.whfb.app/the-strategy-phase/the-strategy-phase-sequence)

* Start of Turn
* Command
* Conjuration (Enchantment or Hex spells)
* Rally Fleeing Troops

### 2. Movement Phase
[The Turn Sequence](https://tow.whfb.app/the-turn-sequence)

* Declaration of Charges and Charge Reactions
* Movement of Charging Units
* Compulsory Movement
* Conveyance Spells

### 3. Shooting Phase
[The Turn Sequence](https://tow.whfb.app/the-turn-sequence)

* Shooting with Missile Weapons, War Machines, and Magic Missiles
* Casting Magical Vortex Spells

### 4. Combat Phase
[The Turn Sequence](https://tow.whfb.app/the-turn-sequence)

* Units fight in Hand-to-Hand Melee
* Wizards cast Assailment Spells to fend off attackers
* Resolution of Combats
* End of Turn

Note that the order of phases is not explicitly stated as a sequence, but rather each phase has its own set of sub-phases and steps.

In [98]:
prompt = "which miniatures base sizes are allowed in the game?"

response = client.chat.completions.create(
  model=gpt_model,
  messages=add_context(prompt),
  temperature=0.3,
  top_p=1.0,
)
display(Markdown(response.choices[0].message.content))

**Base Size in Warhammer: The Old World**
=====================================

According to the Rulebook, all models used in a game of Warhammer: The Old World should be mounted upon a square or rectangular base. The dimensions of which are given here in millimeters (mm).

### Base Sizes
----------------

The following base sizes are allowed in the game:

* Square bases with the following dimensions:
	+ 20x20 mm [Rulebook, p. 98]
	+ 30x30 mm [Rulebook, p. 98]
	+ 40x40 mm [Rulebook, p. 98]
	+ 50x50 mm [Rulebook, p. 98]
	+ 60x60 mm [Rulebook, p. 98]
* Rectangular bases with the following dimensions:
	+ 20x30 mm [Rulebook, p. 98]
	+ 30x40 mm [Rulebook, p. 98]

Note that these are the recommended base sizes, but it's worth noting that some players may have older models with varying base sizes.

### Reference Links
-------------------

* [Model Profiles](https://tow.whfb.app/model-profiles)
* [Base Size](https://tow.whfb.app/model-profiles/base-size)
* [Rulebook page: Rulebook, p. 96](https://tow.whfb.app/characters/too-many-characters#rulebook-p-96)
* [Rulebook page: Rulebook, p. 98](https://tow.whfb.app/characters/too-many-characters#rulebook-p-98)
* [Rulebook page: Rulebook, p. 276](https://tow.whfb.app/warhammer-armies/recommended-size-of-game)
* [Rulebook page: Rulebook, p. 285](https://tow.whfb.app/warhammer-battles/size-of-battlefield)

In [99]:
prompt = "what is the most powerful monster of the game?"

response = client.chat.completions.create(
  model=gpt_model,
  messages=add_context(prompt),
  temperature=0.3,
  top_p=1.0,
)
display(Markdown(response.choices[0].message.content))

**Most Powerful Monster in Warhammer the Old World**
=============================================

Determining the most powerful monster in Warhammer the Old World is a subjective task, as it depends on various factors such as the player's army composition and playstyle. However, based on the provided sources, we can identify some of the most formidable monsters in the game.

**Behemoths: The Largest and Most Powerful**
-----------------------------------------

According to the Rulebook [1], Behemoths are the largest beings in the Warhammer world, including mighty Dragons, Greater Daemons of Chaos, towering Giants, and lumbering constructs of science and sorcery. These creatures are categorized into two sub-categories:

* **Monstrous Creatures**: These include fearsome beasts such as giant spiders, wolves, and bears.
* **Behemoths**: This category includes the largest and most powerful creatures, including Dragons, Greater Daemons, Giants, and constructs.

**Notable Behemoths**
-------------------

Some notable Behemoths mentioned in the sources include:

* **Dragons**: These legendary creatures are known for their immense strength and power.
* **Greater Daemons of Chaos**: As servants of the Dark Gods, these daemons are feared throughout the Warhammer world for their malevolent powers.
* **Giants**: Towering over normal humans, Giants are formidable opponents on the battlefield.

**Conclusion**
----------

While it's difficult to pinpoint a single most powerful monster in Warhammer the Old World, Behemoths are undoubtedly the largest and most fearsome creatures in the game. Dragons, Greater Daemons of Chaos, and Giants are among the most notable Behemoths, each with their unique abilities and strengths.

**References**
--------------

[1] Rulebook, p. 196
[52] [Behemoths](https://tow.whfb.app/troop-types-in-detail/behemoths)

In [100]:
prompt = "what is the magic armour that provides more benefit given its cost?"

response = client.chat.completions.create(
  model=gpt_model,
  messages=add_context(prompt),
  temperature=0.3,
  top_p=1.0,
)
display(Markdown(response.choices[0].message.content))

**Magic Armour in Warhammer The Old World**

In the context of Warhammer The Old World, magic armour provides additional protection to a model's vital areas, but its cost is significant. According to the Rulebook [1], a model can never have an armour value better than 2+, and any bonuses that would improve a model's armour further have no effect.

**Most Cost-Effective Magic Armour**

Among the available magic armours, **Armour of Meteoric Iron** is the most cost-effective option. It costs 20 points [4] and provides its wearer with an armour value of 5+, which cannot be improved in any way. This makes it a valuable investment for models that need high levels of protection.

**Other Notable Magic Armours**

* **Armour of Destiny**: This heavy armour costs 70 points [3] and provides its wearer with a 4+ Ward save against any wounds suffered.
* **Bedazzling Helm**: This helm costs 60 points [2] and improves the wearer's armour value by 1 (to a maximum of 2+) and causes enemies to suffer a -1 modifier to their rolls to hit.

**Comparison**

| Magic Armour | Cost | Armour Value |
| --- | --- | --- |
| Armour of Meteoric Iron | 20 points | 5+ |
| Armour of Destiny | 70 points | 4+ (Ward save) |
| Bedazzling Helm | 60 points | 2+ (+1 bonus) |

Note: The armour values listed are the maximum possible values for each armour, taking into account any bonuses or special rules.

References:

[1] Rulebook, p. 220
[2] [Light Armour](https://tow.whfb.app/weapons-of-war/light-armour)
[3] [Armour of Destiny](https://tow.whfb.app/magic-items/armour-of-destiny)
[4] [Magic Armour](https://tow.whfb.app/magic-items/magic-armour)