In [3]:
from pathlib import Path
from datetime import datetime



In [11]:
import requests
from chromadb import HttpClient
import time
import json
import logging
#from mtg.objects import ChromaDocument


logging.basicConfig(filename='../vector_database.log', level=logging.DEBUG,
                    format='%(levelname)s:%(asctime)s:%(message)s')

logger = logging.getLogger()
handler = logging.StreamHandler()  # Create a handler for stdout
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')  # Customize the log format
handler.setFormatter(formatter)

logger.addHandler(handler)  # Add the handler to the logger
logger.setLevel(logging.INFO)  # Set the logging level to INFO

In [12]:
from pydantic import BaseModel, Field
from typing import Union

class ChromaDocument(BaseModel):
    id: str  # text for display
    document: str  # text for vectorizing
    metadata: dict[str, Union[str, list[str]]] = Field(
        default_factory=dict
    )  # more info
    
    def __repr__(self):
        return f"Document({self.id})"

In [77]:
def get_request(api_url: str):
    response = requests.get(api_url)
    json = response.json()
    return json

In [99]:
# TODO continue here <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# extract chapter names
def full_extract() -> list[ChromaDocument]: 
    logging.info(f"Starting full-extraction")
    chapter_names = get_request("https://api.academyruins.com/cr/toc")
    logging.info(f"Successfully extracted chapter names")
    # extract rules
    rules = get_request("https://api.academyruins.com/cr") 
    logging.info(f"Successfully extracted rules")
    
    return rules, chapter_names

    # TODO: get glossary and unoficcial glossary
    # TODO: check how keywords where added to rules

def delta_extract(ids:list) -> list[ChromaDocument]: 
    logging.info(f"Starting delta-extraction")
    chapter_names = get_request("https://api.academyruins.com/cr/toc")
    logging.info(f"Successfully extracted chapter names")
    # extract rules
    rules = get_request("https://api.academyruins.com/cr") 
    logging.info(f"Successfully extracted rules")
    

    logging.info(f"Successfully extracted documents")
    return rules, chapter_names

    # TODO: get glossary and unoficcial glossary
    # TODO: check how keywords where added to rules


rules, chapters = delta_extract(update_ids)

2024-05-13 00:29:18,960 - root - INFO - Starting delta-extraction
2024-05-13 00:29:18,960 - root - INFO - Starting delta-extraction
2024-05-13 00:29:19,167 - root - INFO - Successfully extracted chapter names
2024-05-13 00:29:19,167 - root - INFO - Successfully extracted chapter names
2024-05-13 00:29:19,540 - root - INFO - Successfully extracted rules
2024-05-13 00:29:19,540 - root - INFO - Successfully extracted rules
2024-05-13 00:29:19,541 - root - INFO - Successfully extracted documents
2024-05-13 00:29:19,541 - root - INFO - Successfully extracted documents


In [100]:
rules

{'100.1': {'ruleNumber': '100.1',
  'examples': None,
  'ruleText': 'These Magic rules apply to any Magic game with two or more players, including two-player games and multiplayer games.',
  'fragment': '1',
  'navigation': {'previousRule': None, 'nextRule': '100.1a'}},
 '100.2': {'ruleNumber': '100.2',
  'examples': None,
  'ruleText': 'To play, each player needs their own deck of traditional Magic cards, small items to represent any tokens and counters, and some way to clearly track life totals.',
  'fragment': '2',
  'navigation': {'previousRule': '100.1b', 'nextRule': '100.2a'}},
 '100.3': {'ruleNumber': '100.3',
  'examples': None,
  'ruleText': 'Some cards require coins or traditional dice. Some casual variants require additional items, such as specially designated cards, nontraditional Magic cards, and specialized dice.',
  'fragment': '3',
  'navigation': {'previousRule': '100.2d', 'nextRule': '100.4'}},
 '100.4': {'ruleNumber': '100.4',
  'examples': None,
  'ruleText': 'Each 

In [None]:
def transform_data(rules:dict, chapter_names:dict) -> list[ChromaDocument]:
    logging.info(f"Starting transformation")
    # transform rules
    documents = []
    chapters = {}

    for section in chapter_names:
        section_title = section['title']
        for subsection in section['subsections']:
            chapterInfo = {
                "sectionNumber": section['number'],
                "sectionTitle": section['title'],
                "subsectionNumber": subsection['number'],
                "subsectionTitle": subsection['title'],
                "combined_title": f"Comprehensive Rules - {section['number']} {section_title} - {subsection['number']} {subsection['title']}"
            }
            chapters[f'{subsection["number"]}'] = chapterInfo

    for rule in rules.values():
        document = ChromaDocument(
            id = rule['ruleNumber'],
            document = f"{rule['ruleNumber']}: {rule['ruleText']}",
            metadata = {
                "documentType": "rule",
                "sectionNumber": f"{chapters.get(rule['ruleNumber'].split('.')[0])['sectionNumber']}",
                "sectionTitle": str(chapters.get(rule['ruleNumber'].split('.')[0])['sectionTitle']),
                "subsectionNumber": str(chapters.get(rule['ruleNumber'].split('.')[0])['subsectionNumber']),
                "subsectionTitle": str(chapters.get(rule['ruleNumber'].split('.')[0])['subsectionTitle']),
                "combined_title": str(chapters.get(rule['ruleNumber'].split('.')[0])['combined_title']),
                "url": f"https://yawgatog.com/resources/magic-rules/#R{rule['ruleNumber'].replace('.', '')}"
            }
        )
        documents.append(document)

        if rule['examples']:
            example_counter = 1
            for example in rule['examples']:
                document = ChromaDocument(
                    id = f"{rule['ruleNumber']} - Example {example_counter}",
                    document = f"{rule['ruleNumber']} - Example {example_counter}: {example}",
                    metadata = {
                        "documentType": "example",
                        "sectionNumber": str(chapters.get(rule['ruleNumber'].split('.')[0])['sectionNumber']),
                        "sectionTitle": str(chapters.get(rule['ruleNumber'].split('.')[0])['sectionTitle']),
                        "subsectionNumber": str(chapters.get(rule['ruleNumber'].split('.')[0])['subsectionNumber']),
                        "subsectionTitle": str(chapters.get(rule['ruleNumber'].split('.')[0])['subsectionTitle']),
                        "combined_title": f"{chapters.get(rule['ruleNumber'].split('.')[0])['combined_title']} - Example {example_counter}",
                        "url": f"https://yawgatog.com/resources/magic-rules/#R{rule['ruleNumber'].replace('.', '')}"
                    }
                )
                documents.append(document)
                example_counter += 1

    return documents

In [14]:
from chromadb.utils import embedding_functions
from chromadb.api.models.Collection import Collection

def get_collection(
        collection_name:str = 'crRules',
        host:str='localhost', 
        port:int=8000, 
        device:str='cpu',  #possible values: 'cpu', 'cuda'
        model_name:str='thenlper/gte-large'
) -> Collection:
    # create chroma client
    client = HttpClient(host, port)
    #TODO: make other embeddingfunctions available 
    ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name, device=device)  # sentence-transformers/all-MiniLM-L6-v2
    # get/create collection
    collection = client.get_or_create_collection(name=collection_name, embedding_function=ef)
    
    return collection

def get_collection_status(collection) -> str:
    if collection.count() = 0:
        return 'not initialized'
    if 




In [15]:
def update_last_successful_load(collection: list[ChromaDocument]):
    try:
        # write timestamp of last successful load to collection metadata
        collection.modify(metadata={
            "lastUpdate": time.time()
        }) 
        logging.info(f"Successfully updated the last successful load timestamp for {collection.name}.")
    except Exception as e:
        logging.exception(f"Error updating the last successful load timestamp for {collection.name}.")
        raise

def upsert_documents_to_collection(documents: list, collection: list[ChromaDocument]) -> None: 
    try:
        ids = [document.id for document in documents]
        # Upsert documents
        collection.upsert(
            ids=ids, 
            documents=[document.document for document in documents],
            metadatas=[document.metadata for document in documents]
        )
        logging.info(f"Successfully upserted {len(ids)} documents to the collection: {collection.name}.")
    except Exception as e:
        logging.exception(f"Error upserting documents to the collection: {collection.name}.")
        raise
    update_last_successful_load(collection.name)
    return

def delete_documents_from_collection(collection, ids:list=[]) -> None:
    try:
        collection.delete(ids=ids)
        logging.info(f"Successfully deleted {len(ids)} documents from the collection: {collection.name}.")
    except Exception as e:
        logging.exception(f"Error deleting documents to the collection: {collection.name}.")
        raise
    return
        


In [74]:
def get_delta(): # Initialize a set to hold the distinct ruleNumbers
    updates = get_request(api_url="https://api.academyruins.com/diff/cr")
    distinct_rule_numbers = set()

        # Iterate over each entry in the data
    for entry in updates['changes']:
        try:
            if entry['old'] and 'ruleNumber' in entry['old']:
                distinct_rule_numbers.add(entry['old']['ruleNumber'])
            if entry['new'] and 'ruleNumber' in entry['new']:
                distinct_rule_numbers.add(entry['new']['ruleNumber'])

        # Print the distinct ruleNumbers
            
        except Exception as e:
            logging.info(f"Error deleting documents to the collection: {updates['changes']}.")
            continue

    for entry in updates['moves']:
        try:
            distinct_rule_numbers.add(float(entry['from']))
            distinct_rule_numbers.add(float(entry['to']))

        except Exception as e:
            logging.info(f"Error deleting documents to the collection: {updates['moves']}.")
            continue
        
    return distinct_rule_numbers


In [88]:
update_ids = get_delta()
data = delta_extract(update_ids)

2024-05-13 00:25:12,497 - root - INFO - Starting full-extraction
2024-05-13 00:25:12,497 - root - INFO - Starting full-extraction
2024-05-13 00:25:25,621 - root - INFO - Successfully extracted documents
2024-05-13 00:25:25,621 - root - INFO - Successfully extracted documents


In [90]:
update_ids = get_delta()
data = full_extract()

2024-05-13 00:25:57,213 - root - INFO - Starting full-extraction
2024-05-13 00:25:57,213 - root - INFO - Starting full-extraction
2024-05-13 00:25:57,416 - root - INFO - Successfully extracted chapter names
2024-05-13 00:25:57,416 - root - INFO - Successfully extracted chapter names
2024-05-13 00:25:57,574 - root - INFO - Successfully extracted rules
2024-05-13 00:25:57,574 - root - INFO - Successfully extracted rules


In [42]:
#newly added
for change in updates['changes']:
    
    try: 
        if (change['new'] is None):
                print( changechange
f"""
the following entries are or updated new:

ruleNumber: {change['new']['ruleNumber']}
ruleText: {change['new']['ruleText']}

"""
        )
# updated
        elif (change['old']['ruleNumber'] != change['new']['ruleNumber']):
                print( \
f"""
newRuleNumber: {change['new']['ruleNumber']}
newRuleText: {change['new']['ruleText']}

oldRuleNumber: {change['new']['ruleNumber']}
oldRuleText: {change['new']['ruleText']}

"""
        )
    except: 
        logging.exception(f"Error deleting documents to the collection: {change}.")
        continue




    try: 
        if (change['old']['ruleNumber'] != change['new']['ruleNumber']):
            print( \
    f"""
    ruleNumber: {change['old']['ruleNumber']}
    ruleText: {change['old']['ruleText']}

    ruleNumber: {change['new']['ruleNumber']}
    ruleText: {change['new']['ruleText']}

    """
            )
    except: 
        logging.exception(f"Error deleting documents to the collection: {change}.")
        continue
    



the following entries are or updated new:

ruleNumber: 116.2
ruleText: There are <<<<eleven>>>> special actions:



the following entries are or updated new:

ruleNumber: 116.2k
ruleText: A player who has a card with plot in their hand may exile that card. This is a special action. A player can take this action any time they have priority during their own turn while the stack is empty. See rule 702.170, “Plot.”



the following entries are or updated new:

ruleNumber: 205.3m
ruleText: Creatures and tribals share their lists of subtypes; these subtypes are called creature types. The creature types are Advisor, Aetherborn, Alien, Ally, Angel, Antelope, Ape, Archer, Archon, <<<<Armadillo,>>>> Army, Artificer, Assassin, Assembly-Worker, Astartes, Atog, Aurochs, Avatar, Azra, Badger, Balloon, Barbarian, Bard, Basilisk, Bat, Bear, Beast, <<<<Beaver,>>>> Beeble, Beholder, Berserker, Bird, Blinkmoth, Boar, Bringer, Brushwagg, Camarid, Camel, Capybara, Caribou, Carrier, Cat, Centaur, Cephalid,

In [52]:
for change in updates['changes']:
    try: 
        if (change['old']['ruleNumber'] != change['new']['ruleNumber']):
            print( \
    f"""
    ruleNumber: {change['old']['ruleNumber']}
    ruleText: {change['old']['ruleText']}

    ruleNumber: {change['new']['ruleNumber']}
    ruleText: {change['new']['ruleText']}

    """
            )
    except: 
        logging.exception(f"Error deleting documents to the collection: {change}.")
        continue

2024-05-12 23:12:29,021 - root - ERROR - Error deleting documents to the collection: {'old': None, 'new': {'ruleNumber': '116.2k', 'ruleText': 'A player who has a card with plot in their hand may exile that card. This is a special action. A player can take this action any time they have priority during their own turn while the stack is empty. See rule 702.170, “Plot.”'}}.
Traceback (most recent call last):
  File "/tmp/ipykernel_24490/2090292594.py", line 3, in <module>
    if (change['old']['ruleNumber'] != change['new']['ruleNumber']):
        ~~~~~~~~~~~~~^^^^^^^^^^^^^^
TypeError: 'NoneType' object is not subscriptable
2024-05-12 23:12:29,021 - root - ERROR - Error deleting documents to the collection: {'old': None, 'new': {'ruleNumber': '116.2k', 'ruleText': 'A player who has a card with plot in their hand may exile that card. This is a special action. A player can take this action any time they have priority during their own turn while the stack is empty. See rule 702.170, “Plot.”


    ruleNumber: 500.11
    ruleText: No game events can occur between <<<<turns,>>>> phases, or <<<<steps.>>>>

    ruleNumber: 500.12
    ruleText: No game events can occur between <<<<steps,>>>> phases, or <<<<turns.>>>>

    


In [45]:
collection.upsert(
    ids=['116.2'],
    documents=['There are <<<<drelf>>>> special actions:']
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.72it/s]


In [31]:
collection.get(ids=['116.2'])

{'ids': ['116.2'],
 'embeddings': None,
 'metadatas': [{'combined_title': 'Comprehensive Rules - 1 Game Concepts - 116 Special Actions',
   'documentType': 'rule',
   'sectionNumber': '1',
   'sectionTitle': 'Game Concepts',
   'subsectionNumber': '116',
   'subsectionTitle': 'Special Actions',
   'url': 'https://yawgatog.com/resources/magic-rules/#R1162'}],
 'documents': ['There are <<<<drelf>>>> special actions:'],
 'data': None,
 'uris': None}

In [10]:
############CONFIG############

chromadb_host:str = 'localhost'
chromadb_port:str =  '8000'
chromadb_device:str = 'cuda'
model_name:str = 'thenlper/gte-large'





collection = get_collection(
    collection_name='crRules',
    device='cuda'
)

status = get_collection_status(collection)
if status == 'full':
    rules, chapters = full_extract()
    documents = transform_data(rules, chapters)
    logging.info(f"Successfully created documents")
    collection = get_collection(device='cuda')
    logging.info(f"Successfully created collection")
    upsert_documents_to_collection(documents, collection)
    logging.info(f"Successfully upserted documents to collection")
    update_last_successful_load(collection.name)
    logging.info(f"Successfully updated logs")

elif status == 'delta':
    logging.info(f"Starting delta-load")
    rules, chapters = delta_extract()
    logging.info(f"Successfully extracted documents")
    documents = transform_data(rules, chapters)
    logging.info(f"Successfully created documents")
    collection = get_collection(device='cuda')
    logging.info(f"Successfully created collection")
    upsert_documents_to_collection(documents, collection)
    logging.info(f"Successfully upserted documents to collection")
    update_last_successful_load(collection.name)
    logging.info(f"Successfully updated logs")


elif status == None:
    logging.info(f"No update available")




2024-05-12 19:48:19,405 - root - INFO - Successfully extracted documents
2024-05-12 19:48:19,426 - root - INFO - Successfully created documents
2024-05-12 19:48:19,433 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-05-12 19:48:19,463 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
  from .autonotebook import tqdm as notebook_tqdm
2024-05-12 19:48:21,597 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-large
2024-05-12 19:48:22,321 - root - INFO - Successfully created collection
Batches: 100%|██████████| 102/102 [00:16<00:00,  6.08it/s]
2024-05-12 19:48:47,341 - root - INFO - Successfully upserted 3235 documents to the collection: crRules.
2024-05-12 19:48:47,343 - root - INFO - Successfully updated the last s

In [18]:
# check for updates
api_url: str = (
    "https://api.academyruins.com/diff/cr"
)
response = requests.get(api_url)
updates = response.json()


In [19]:
import datetime

def convert_to_timestamp(date_str, date_format = '%Y-%m-%d'):
    """
    date_format string assumes that the input is in the format 'YYYY-MM-DD'
    """
    
    # Parse the string into a datetime object
    datetime_obj = datetime.datetime.strptime(date_str, date_format)
    
    # Get a floating-point representation of the Unix timestamp
    timestamp = datetime_obj.timestamp()
    
    return timestamp

date_string = '2024-04-11'
timestamp = convert_to_timestamp(date_string)
print(timestamp)

1712786400.0


In [24]:
formatted_timestamp = float(format(time.time(), '.1f'))
formatted_timestamp


if timestamp < formatted_timestamp: 
    print("formatted_timestamp smaller")
else:
    print("test failed")

formatted_timestamp smaller


In [None]:
# perform update
## full extract & transform
## get only changed documents

In [15]:
# create update log dict


In [8]:
updates

{'sourceSet': 'Fallout',
 'sourceCode': 'PIP',
 'destSet': 'Outlaws of Thunder Junction',
 'destCode': 'OTJ',
 'creationDay': '2024-04-11',
 'changes': [{'old': {'ruleNumber': '116.2',
    'ruleText': 'There are <<<<ten>>>> special actions:'},
   'new': {'ruleNumber': '116.2',
    'ruleText': 'There are <<<<eleven>>>> special actions:'}},
  {'old': None,
   'new': {'ruleNumber': '116.2k',
    'ruleText': 'A player who has a card with plot in their hand may exile that card. This is a special action. A player can take this action any time they have priority during their own turn while the stack is empty. See rule 702.170, “Plot.”'}},
  {'old': {'ruleNumber': '205.3m',
    'ruleText': 'Creatures and tribals share their lists of subtypes; these subtypes are called creature types. The creature types are Advisor, Aetherborn, Alien, Ally, Angel, Antelope, Ape, Archer, Archon, Army, Artificer, Assassin, Assembly-Worker, Astartes, Atog, Aurochs, Avatar, Azra, Badger, Balloon, Barbarian, Bard, B

In [None]:
api_url: str = (
    "https://api.academyruins.com/cr/keywords"
)
response = requests.get(api_url)
response.json()


{'keywordAbilities': ['Deathtouch',
  'Defender',
  'Double Strike',
  'Enchant',
  'Equip',
  'First Strike',
  'Flash',
  'Flying',
  'Haste',
  'Hexproof',
  'Indestructible',
  'Intimidate',
  'Landwalk',
  'Lifelink',
  'Protection',
  'Reach',
  'Shroud',
  'Trample',
  'Vigilance',
  'Ward',
  'Banding',
  'Rampage',
  'Cumulative Upkeep',
  'Flanking',
  'Phasing',
  'Buyback',
  'Shadow',
  'Cycling',
  'Echo',
  'Horsemanship',
  'Fading',
  'Kicker',
  'Flashback',
  'Madness',
  'Fear',
  'Morph',
  'Amplify',
  'Provoke',
  'Storm',
  'Affinity',
  'Entwine',
  'Modular',
  'Sunburst',
  'Bushido',
  'Soulshift',
  'Splice',
  'Offering',
  'Ninjutsu',
  'Epic',
  'Convoke',
  'Dredge',
  'Transmute',
  'Bloodthirst',
  'Haunt',
  'Replicate',
  'Forecast',
  'Graft',
  'Recover',
  'Ripple',
  'Split Second',
  'Suspend',
  'Vanishing',
  'Absorb',
  'Aura Swap',
  'Delve',
  'Fortify',
  'Frenzy',
  'Gravestorm',
  'Poisonous',
  'Transfigure',
  'Champion',
  'Changelin

In [None]:
api_url: str = (
    "https://api.academyruins.com/cr/glossary"
)
response = requests.get(api_url)
response.json()

{'abandon': {'term': 'Abandon',
  'definition': 'To turn a face-up ongoing scheme card face down and put it on the bottom of its owner’s scheme deck. See rule 701.26, “Abandon.”'},
 'ability': {'term': 'Ability',
  'definition': '1. Text on an object that explains what that object does or can do.\n2. An activated or triggered ability on the stack. This kind of ability is an object.\nSee rule 113, “Abilities,” and section 6, “Spells, Abilities, and Effects.”'},
 'ability word': {'term': 'Ability Word',
  'definition': 'An italicized word with no rules meaning that ties together abilities on different cards that have similar functionality. See rule 207.2c.'},
 'absorb': {'term': 'Absorb',
  'definition': 'A keyword ability that prevents damage. See rule 702.64, “Absorb.”'},
 'activate': {'term': 'Activate',
  'definition': 'To put an activated ability onto the stack and pay its costs, so that it will eventually resolve and have its effect. See rule 602, “Activating Activated Abilities.”'

In [None]:
api_url: str = (
    "https://api.academyruins.com/cr/toc"
)
response = requests.get(api_url)
chapter_names = response.json()

In [None]:
chapters = {}

for section in chapter_names:
    section_title = section['title']
    for subsection in section['subsections']:
        combined_title = f"Comprehensive Rules - {section['number']} {section_title} - {subsection['number']} {subsection['title']}"
        chapters[f'{subsection["number"]}'] = combined_title

chapters


In [None]:
api_url: str = (
    "https://api.academyruins.com/cr/unofficial-glossary"
)
response = requests.get(api_url)
response.json()

{'addendum': {'term': 'Addendum',
  'definition': 'Addendum provides some benefit to the spell being cast if it was cast during your main phase. [Unofficial]'},
 'battalion': {'term': 'Battalion',
  'definition': 'Battalion refers to an ability like “Whenever this creature and at least two other creatures attack, [effect].” [Unofficial]'},
 'bloodrush': {'term': 'Bloodrush',
  'definition': 'Bloodrush refers to an ability like “[Cost], Discard this card: Target attacking creature gets [effect].” [Unofficial]'},
 'channel': {'term': 'Channel',
  'definition': 'Channel refers to an ability like “[Cost], Discard this card: [Effect].” [Unofficial]'},
 'chroma': {'term': 'Chroma',
  'definition': 'Chroma refers to an ability that counts the number of mana symbols of a given color in the mana costs of a set of objects. [Unofficial]'},
 'cohort': {'term': 'Cohort',
  'definition': 'Cohort refers to an ability like “{T}, Tap an untapped Ally you control: [Effect].” [Unofficial]'},
 'constellat