# Add value & effect retro-engineering

Goal: transform the dataset into a new "pimped" dataset with raw values and effect values, to make it easier to search for specific uniques later

In [1]:
import polars as pl
import os
from datetime import datetime
import shutil
import re

In [2]:
# Load the source dataset from the parquet file.
dataset = pl.read_parquet("dataset.parquet")
# Working copy that receives the extra value columns further down.
dataset_pimped = dataset.clone()

In [3]:
def backup_dataset_pimped(source_path="dataset_pimped.parquet", backup_folder="dataset_backup"):
    """Copy the pimped dataset file into a dated backup file.

    The copy is named ``<source stem>_<YYYY-MM-DD><source extension>``; with the
    defaults, ``dataset_pimped.parquet`` is copied to
    ``dataset_backup/dataset_pimped_<today>.parquet``. A backup already made
    the same day is overwritten.

    Args:
        source_path: path of the file to back up.
        backup_folder: destination folder, created if it does not exist.

    Raises:
        FileNotFoundError: if ``source_path`` does not exist.
    """
    # Create the backup folder if it does not exist yet
    os.makedirs(backup_folder, exist_ok=True)

    # Insert today's date between the file stem and its extension
    stem, extension = os.path.splitext(os.path.basename(source_path))
    today_date = datetime.now().strftime("%Y-%m-%d")
    backup_file_path = os.path.join(backup_folder, f"{stem}_{today_date}{extension}")

    # Copy the file
    shutil.copy(source_path, backup_file_path)

    print(f"Fichier sauvegardé sous : {backup_file_path}")

# Back up the existing dataset_pimped.parquet before it is rewritten further down.
backup_dataset_pimped()

Fichier sauvegardé sous : dataset_backup\dataset_pimped_2025-04-16.parquet


## 1. Raw value

Calculation of the raw (without effects) values given mana costs and Forest/Mountain/Ocean powers

In [4]:
def calculate_value(input_df: pl.DataFrame, hand_reserve='hand'):
    """Calculate the raw value of each card: mean power divided by cost.

    The mean power is ``(Fp + Mp + Op) / 3``.

    Args:
        input_df: the input polars DataFrame; it must hold the ``Fp``, ``Mp`` and
            ``Op`` columns plus the cost column(s) used by the requested mode.
        hand_reserve: ``'hand'`` divides by ``Mcost``, ``'reserve'`` divides by
            ``Rcost``; any other value (e.g. ``None``) returns the mean of both.

    Returns:
        A DataFrame with a single ``value`` column.
    """
    mean_power = (pl.col('Fp') + pl.col('Mp') + pl.col('Op')) / 3
    hand_value = mean_power / pl.col('Mcost')
    reserve_value = mean_power / pl.col('Rcost')

    if hand_reserve == 'hand':
        value = hand_value
    elif hand_reserve == 'reserve':
        value = reserve_value
    else:
        value = (hand_value + reserve_value) / 2
    # Only the selected expression is evaluated against the frame.
    return input_df.select(value.alias('value'))

In [5]:
# One single-column frame per flavour of raw value:
# hand/reserve average, hand only, reserve only.
raw_HRval = calculate_value(dataset, hand_reserve=None).rename({"value": "raw_HRvalue"})
raw_Hval = calculate_value(dataset, hand_reserve='hand').rename({"value": "raw_Hvalue"})
raw_Rval = calculate_value(dataset, hand_reserve='reserve').rename({"value": "raw_Rvalue"})

# Round each value column to 2 decimals before attaching it.
rounded_columns = [
    frame[column_name].round(2)
    for frame, column_name in (
        (raw_HRval, 'raw_HRvalue'),
        (raw_Hval, 'raw_Hvalue'),
        (raw_Rval, 'raw_Rvalue'),
    )
]

# Restart from a fresh copy of the source data and add the three columns.
dataset_pimped = dataset.clone().with_columns(rounded_columns)

dataset_pimped.write_parquet("dataset_pimped.parquet") # save

## 2. Effects value

Calculation of the value considering the effects (texts) on the cards<br>
The text on the card can be 1 to 3 effects. The effects are built this way:<br>
Condition 1 (optional) + Condition 2 (optional) + Effect<br>
- Conditions can be classified this way :<br>
  - instant trigger	--> r (from reserve), h (from hand), j (from both)<br>
  - delayed trigger	--> at noon —, at dusk —<br>
  - activatable trigger --> when (very interesting because it can be activated multiple times)<br>
  - reducer -->	you may, if (a simple transition to adding a 2nd condition; I call them reducers because they make the effect harder to get)<br>
- Effects can also be what I call Pending effects, that always impact the board while controlled<br>
[gSheet helper](https://docs.google.com/spreadsheets/d/1s0N4AE5BEyj7s5jY2r8u5-8yk20lqbMfvkDhnW3zu9U/edit?gid=0#gid=0)

### 1. Extract all the conditions

Finding a way to automatically extract the conditions and the effect from the text

#### 1.0 separate effects
There could be one to three effects on the same card, so we have to separate them

In [144]:
# Sentence openers that continue the previous effect instead of starting a new one.
_CONTINUATION_PREFIXES = (
    "if you", "if it's",
    "you may", "on a",
    "put one", "it gains",
    "when you roll a", "exhaust it",
    "i gain x boosts", "target character gain x boosts",
    "choose after", "this effect",
    "unless it was in", '"',
)


def segregate_effects(text):
    """Split a card text into its separate effects.

    The text is split on ``'. '``. A fragment that starts with one of the
    continuation prefixes is glued back onto the previous fragment (with the
    ``'. '`` separator restored) instead of counting as a new effect.

    Args:
        text: the full effect text of a card.

    Returns:
        The list of effect strings, in reading order. A final dot at the very
        end of ``text`` is kept on the last effect.
    """
    # some texts have 2 spaces after the dot, replace them with 1 space
    fragments = text.replace('.  ', '. ').split('. ')

    out_lst = []
    for fragment in fragments:
        # A continuation can only be merged when there is a previous effect.
        if out_lst and fragment.startswith(_CONTINUATION_PREFIXES):
            out_lst[-1] = out_lst[-1] + '. ' + fragment
        else:
            out_lst.append(fragment)
    return out_lst

Apply it to every unique character card of the dataset

In [145]:
# Only unique character cards are analysed.
is_unique_character = (pl.col('type') == 'CHARACTER') & (pl.col('rarity') == 'UNIQUE')

# New column holding the list of effects produced by segregate_effects()
effects_list_column = (
    pl.col("MAIN_EFFECT")
    .map_elements(segregate_effects, return_dtype=pl.List(pl.Utf8))
    .alias('effects_list')
)

df_with_effect_lst = dataset_pimped.filter(is_unique_character).with_columns(effects_list_column)

In [118]:
# Sanity check of the split: for each card, report the first effect that still
# holds a double space after a dot, or a dot + whitespace followed by a
# character that is neither a word character, whitespace, '{' nor '}'.
suspicious_after_dot = re.compile(r'\.\s([^\w\s{}])')

for row in df_with_effect_lst.iter_rows(named=True):
    effects_list = row['effects_list']
    if effects_list is None:
        continue
    for effect in effects_list:
        if '.  ' in effect:
            print(row)
            break
        # Search once and reuse the match for both the test and the report.
        found_symbol = suspicious_after_dot.search(effect)
        if found_symbol:
            print(row)
            special_character = found_symbol.group(1)
            print(f"Special character found: {special_character}")
            break

in order to check our segregate_effects function we are going to test it on a small sample and also add a new col with the number of effects

In [146]:
seed = 1337

df_with_effect_lst = df_with_effect_lst.with_columns(   # adding the number of effects in a column
    pl.col("effects_list").list.len().alias('effects_number')
)


def _print_card_sample(cards, title, show_effects=True, n=4, seed=seed):
    """Print ``title`` followed by up to ``n`` cards sampled from ``cards``.

    Args:
        cards: polars DataFrame of cards to sample from.
        title: header line printed before the cards.
        show_effects: when True, also print every entry of ``effects_list``.
        n: maximum number of cards to print.
        seed: seed passed to ``sample`` so the selection is reproducible.
    """
    print(title)
    # Cap the request to the rows available: sampling without replacement
    # cannot return more rows than the frame holds.
    sample_size = min(n, cards.height)
    for row in cards.sample(n=sample_size, seed=seed).iter_rows(named=True):
        print(f"\n\t{row['name']} ({row['faction']}) {row['Mcost']}/{row['Rcost']} - {row['Fp']}/{row['Mp']}/{row['Op']} [{row['id']}]")
        if show_effects:
            for effect in row['effects_list']:
                print(f"\t\t{effect}")


# (title, filter, print the effects?) for each group to inspect.
# To debug segregate_effects(), add a group for cards with more than 3 effects:
#     ("MORE THAN 3 effects:", pl.col('effects_number') > 3, True),
sample_groups = [
    ("3 effects:", pl.col('effects_number') == 3, True),
    ("\n2 effects:", pl.col('effects_number') == 2, True),
    ("\n1 effect:", pl.col('effects_number') == 1, True),
    # cards without text have no effects_list to iterate, so only the header line is printed
    ("\nNo effect:", pl.col('MAIN_EFFECT').is_null(), False),
]

for group_title, group_filter, group_has_effects in sample_groups:
    filtered_df = df_with_effect_lst.filter(group_filter)
    _print_card_sample(filtered_df, group_title, show_effects=group_has_effects)

3 effects:

	The Sandman (OR) 3/2 - 0/2/2 [ALT_CORE_B_LY_15_U_6925]
		{h} if you control four or more characters: draw a card
		{h} sabotage
		{r} unless you have eight or more mana orbs: you may have target character gain asleep.

	Red (BR) 3/3 - 2/1/1 [ALT_CORE_B_BR_10_U_3575]
		i am seasoned
		{h} if you control one or more landmarks: i gain 2 boosts
		at dusk — if i have 2 or more boosts: draw a card.

	Amarok (BR) 5/4 - 5/3/3 [ALT_ALIZE_B_BR_39_U_17588]
		{j} if i'm the only character in my expedition: i gain 2 boosts
		when another character joins my expedition — sacrifice a character in my expedition
		at dusk — if your hand is empty: put the top card of your deck in your mana zone as an exhausted mana orb.

	Red (YZ) 3/3 - 1/3/3 [ALT_CORE_B_BR_10_U_2654]
		i am seasoned
		{r} you may put a card from your hand in reserve. if it's a spell: the next spell you play this afternoon costs {1} less
		when i go to reserve from the expedition zone — if i have 2 or more boosts: you may di

#### 1.1 separate conditions and effects

now that we have separated text into 0 to 3 effects, we have to separate conditions and effects

conditions :

In [147]:
# Each pattern captures an opener and the text that follows it, up to the
# first ':', '—' or '. '.
conditions_starters = [
    r'(you may|if|when|unless) (.*?)(?::|—|\. )', # reducers
]

all_conditions = []
for card in df_with_effect_lst.iter_rows(named=True):
    # a null effects_list yields nothing to scan
    for effect_text in card['effects_list'] or []:
        for starter_pattern in conditions_starters:
            all_conditions.extend(
                opener + ' ' + remainder
                for opener, remainder in re.findall(starter_pattern, effect_text)
            )

# Deduplicate and sort the extracted conditions
unique_conditions = sorted(set(all_conditions))
# Append the triggers that are listed by hand rather than extracted
unique_conditions += ['{j}', '{h}', '{r}', 'at dusk —', 'at noon —', 'after rest', 'roll a die. on a 4+']
unique_conditions


['if each of your expeditions is behind or tied',
 'if i have 1 or more boosts',
 'if i have 2 or more boosts',
 'if i have 3 or more boosts',
 'if i would gain asleep, i gain anchored instead',
 'if i would gain fleeting, i gain 1 boost instead',
 "if i'm in {m}",
 "if i'm in {v}",
 "if i'm not fleeting",
 "if i'm the only character in my expedition",
 "if it's a permanent",
 "if it's a spell",
 'if my expedition is behind',
 'if the expedition facing me is in {o}, it can only move forward due to {o}',
 'if the expedition facing me is in {v}, it can only move forward due to {v}',
 "if there are no characters in the expedition i'm played in",
 'if there are three or more base statistics of 0 among characters you control',
 'if there are two or more exhausted cards in reserve',
 "if you can't",
 'if you control a fleeting character',
 'if you control a token',
 'if you control four or more characters',
 'if you control one or more landmarks',
 'if you control three or more characters',


effect :

In [161]:
# Each pattern captures a marker ('{j}' or ':') and the text that follows it,
# up to the first '.', 'if ', 'unless ' or 'you may '.
effects_starters = [
    r'(\{j\}|:) (.*?)(?:\.|if |unless |you may )',
    # r'(:) (.*?)(?:\.|if |unless |you may )'
    # r'(you may) (.*)$'
]

# Scan every effect text of every row of df_with_effect_lst
all_effects = []
for card in df_with_effect_lst.iter_rows(named=True):
    # a null effects_list yields nothing to scan
    for effect_text in card['effects_list'] or []:
        for starter_pattern in effects_starters:
            all_effects.extend(
                captured for _marker, captured in re.findall(starter_pattern, effect_text)
            )

# Two effects added by hand
all_effects += [
    "you may send to reserve target character with hand cost 4 or more",
    "you may send target character to reserve",
]

# Deduplicate and sort, leaving out 'roll a die' and the empty string
unique_effects = sorted(set(all_effects) - {'roll a die', ''})
unique_effects

['"{r} i gain 1 boost',
 'abilities',
 'abilities of target permanent you control',
 'abilities of up to two target permanents you control',
 'all characters in target expedition gain asleep',
 'any number of target characters in {v} gain 2 boosts',
 'cards other than me cost {1} less to play from reserve',
 'cards other than me cost {1} more to play from reserve',
 "cards your opponents play can't cost less than {2}",
 'characters you control gain 1 boost',
 "characters your opponents play can't cost less than {2}",
 'create a brassbug 2/2/2 robot token in each of your expeditions',
 'create a brassbug 2/2/2 robot token in my expedition',
 'create a brassbug 2/2/2 robot token in target expedition',
 'create a brassbug 2/2/2 robot token in your companion expedition',
 'create a brassbug 2/2/2 robot token in your hero expedition',
 "create a brassbug 2/2/2 robot token in your other expedition the one i'm not in",
 'create a mana moth 2/2/2 illusion token in each of your expeditions',
 '

#### 1.2 separate conditions and effects and add a new column

creating a new column with the list of effects with conditions and effect well identified

In [169]:
# using unique_conditions & unique_effects to segment effects texts
def segment_effects_old_old(effects_lst, known_conditions, known_effects):
    """Segment effect texts by prefix matching (first, legacy attempt).

    For each text: take the first known condition the text starts with, then
    the first known condition the remainder starts with, then the first known
    effect the rest starts with. Leading non-alphanumeric characters are
    dropped from each remainder, and one trailing dot is dropped before the
    effect lookup.

    Args:
        effects_lst: effect texts of one card.
        known_conditions: candidate condition strings, tried in list order.
        known_effects: candidate effect strings, tried in list order.

    Returns:
        One ``{"cond1", "cond2", "effect"}`` dict per text; a slot is ``""``
        when nothing matched.
    """
    def first_prefix(candidates, haystack):
        # first candidate, in list order, that `haystack` starts with ("" if none)
        return next((item for item in candidates if haystack.startswith(item)), "")

    def drop_leading_symbols(fragment):
        # trim whitespace, then remove any leading run of non-alphanumeric characters
        return re.sub(r'^[^a-zA-Z0-9]+', '', fragment.strip())

    segmented = []
    for text in effects_lst:
        first_condition = first_prefix(known_conditions, text)

        after_first = drop_leading_symbols(text[len(first_condition):])
        second_condition = first_prefix(known_conditions, after_first)

        after_second = drop_leading_symbols(after_first[len(second_condition):])
        if after_second.endswith('.'):
            after_second = after_second[:-1]
        matched_effect = first_prefix(known_effects, after_second)

        segmented.append(
            {"cond1": first_condition, "cond2": second_condition, "effect": matched_effect}
        )
    return segmented

def segment_effects_old(effects_lst, known_conditions, known_effects):
    """Segment effect texts with regex searches (second, legacy attempt).

    Every known condition and known effect is used as a regex pattern and
    searched anywhere in each text. Debug information is printed along the way.

    Args:
        effects_lst: effect texts of one card.
        known_conditions: condition strings, each passed to ``re.search`` as a pattern.
        known_effects: effect strings, each passed to ``re.search`` as a pattern.

    Returns:
        One dict per text with at least the keys ``cond1``, ``cond2``, ``cond3``
        and ``effect``. More than three condition matches add further
        ``condN`` keys.
    """
    out_lst = []
    print(f'\nNew card:')
    for text in effects_lst:
        dict = {"cond1": "", "cond2": "", "cond3": "", "effect": ""}
        
        # 0. extract conditions
        print(f'\tNew text:')
        matches = []
        for condition in known_conditions:
            matches.append(re.search(condition, text))
        # keep only the patterns that matched, ordered by where they start in the text
        matches = [match for match in matches if match]
        matches.sort(key=lambda x: x.span()[0])
        # with exactly four condition matches, the fourth one is taken as the effect
        if len(matches) == 4:
            dict["effect"] = matches[3].group()
        # [print(f'\t\t{match.group()} {match.span()}') for match in matches]
        # every match is stored, so a 4th (or later) match creates cond4, cond5, ...
        for i, match in enumerate(matches):
            dict[f"cond{i+1}"] = match.group()
        
        # 1. extract effect
        if dict["effect"] == "":
            matches = []
            for effect in known_effects:
                matches.append(re.search(effect, text))
            matches = [match for match in matches if match]
            matches.sort(key=lambda x: x.span()[0])
            print(matches)
            # each match overwrites the previous one: the last one in start order wins
            for i, match in enumerate(matches):
                dict["effect"] = match.group()
            
        # fallback: no known effect found, promote the last filled condition (cond3, else cond2)
        if dict["effect"] == "":
            # print(f'\t\t{text}\n\t\tdict: {dict}')
            if dict[f"cond{3}"] != "":
                dict["effect"] = dict[f"cond{3}"]
                dict[f"cond{3}"] = ""
            else:
                if dict[f"cond{2}"] != "":
                    dict["effect"] = dict[f"cond{2}"]
                    dict[f"cond{2}"] = ""

        # still nothing: print the text for manual inspection
        if dict["effect"] == "":
            print(f'\t\ttext: {text}\n\t\tdict: {dict}\t\t matches: {matches}')

        out_lst.append(dict)
    return out_lst

def segment_effects(effects_lst, known_conditions, known_effects):
    out_lst = []
    # print(f'\n\nNew card:')
    for text in effects_lst:
        text = text.lstrip()
        if text.endswith('.'):
            text = text[:-1]
        text = re.sub(r'\{(\d+)\}', r'\1', text)

        # print(f'\n\tText: {text}')
        dict = {"cond1": "", "cond2": "", "cond3": "", "effect": ""}
        
        # extract conditions
        cond_matches = []
        for condition in known_conditions:
            cond_matches.append(re.search(condition, text))
        cond_matches = [match for match in cond_matches if match]
        cond_matches.sort(key=lambda x: x.span()[0])
        # extract effect
        effect_matches = []
        for effect in known_effects:
            effect = effect.lstrip()
            if effect.endswith('.'):
                effect = effect[:-1]
            effect = re.sub(r'\{(\d+)\}', r'\1', effect)
            effect_matches.append(re.search(effect, text))
        effect_matches = [match for match in effect_matches if match]
        effect_matches.sort(key=lambda x: x.span()[0])
        # print(f'\tcond_matches: {[i.group() for i in cond_matches]}\n\teffect_matches: {[i.group() for i in effect_matches]}')
        
        # if dict["effect"] == "":
        # if effect_matches == []:
        #     print(f'\nText: {text}\ncond_matches: {[i.group() for i in cond_matches]}\neffect_matches: {[i.group() for i in effect_matches]}')

        out_lst.append(dict)
    return out_lst

In [170]:
# df_with_effect_lst_seg = df_with_effect_lst.sample(200, seed=1337).with_columns(      # use sample 200 for debugging purpose
# Struct type of one segmented effect: up to three conditions plus the effect
segmented_effect_dtype = pl.Struct([
    pl.Field("cond1", pl.Utf8),
    pl.Field("cond2", pl.Utf8),
    pl.Field("cond3", pl.Utf8),
    pl.Field("effect", pl.Utf8),
])

# New column with the list of segmented effects
segmented_effects_column = pl.col("effects_list").map_elements(
    lambda effects_lst: segment_effects(effects_lst, unique_conditions, unique_effects),
    return_dtype=pl.List(segmented_effect_dtype),
).alias('effects_list_segregated')

# For debugging, run on a sample instead: df_with_effect_lst.sample(200, seed=1337).with_columns(...)
df_with_effect_lst_seg = df_with_effect_lst.with_columns(segmented_effects_column)

KeyboardInterrupt: 