# Imports

In [1]:
import torch
import scipy
import numpy as np

# State space
We need to specify the size of the state space.
Suppose the value of a worm is $6$.

We define two kinds of state, two kinds of action, and two kinds of transition matrices : the state in which we need to choose whether or not to throw the remaining dice, and the state in which we need to decide which dice to pick.


A state can be described by:
1. The current sum of values of the dice drawn.
2. The values of dice already picked
3. Nb de dés restants
4. Le fait qu'on ait fait un tirage ou pas, et la valeur du tirage le cas échéant

For state $s_1$:

1. Current value: $49$ possible values + $1$ absorbing state
2. Current dice values picked: $2^6$ possible values
3. Number of remaining dice: $9$ possible values

total : $28224$

For state $s_2$:
1. Current value: $49$ possible values
2. Current dice values picked: $2^6$ possible values
3. Number of remaining dice: $9$ possible values
4. Dice drawn: $\binom{14}{6}$ possible values (sticks and stones)

total : $49\times 2^6 \times 9 \times \binom{14}{6}= 84,756,672$

Possible actions after $s_1$:

1. Throw the dice or not

total: $2$

Possible actions after $s_2$:

1. Pick any of the numbers. If we pick an unavailable value, we lose (negative reward + we get into the absorbing state with $0$ dice remaining)

total: $6$


On ne peut pas matérialiser la matrice de transition de l'état $s_1$ à l'état $s_2$: ça ferait
$(49\times 2^6 \times 9)^2 \times \binom{14}{6}*2 = 4.784344621056 × 10^{12}$

Mais on n'est pas obligés d'avoir $2$ matrices d'états différents : on peut obliger l'agent à prendre la décision sans connaître le résultat du lancer.

On suppose qu'un tour est composé de $6\times 2$ actions, car il y a seulement $6$ valeurs de dés possibles.

Il faut donc calculer pour chacun des $6$ tours, la valeur de chaque état récursivement.

On doit créer un état absorbant qui signifie que le joueur a perdu suite à une action interdite

In [4]:
interval_1 = int(scipy.special.binom(9,1))# Etats où on a de 0 à 8 fois le dé 1
interval_2 = int(scipy.special.binom(10,2))# Etats où on a de 0 à 8 fois le dé 1 et de 0 à 8 fois le dé 2
interval_3 = int(scipy.special.binom(11,3))# Etats où on a de 0 à 8 fois le dé 1 et de 0 à 8 fois le dé 2 et de 0 à 8 fois le dé 3
interval_4 = int(scipy.special.binom(12,4))
interval_5 = int(scipy.special.binom(13,5))
interval_6 = int(scipy.special.binom(14,6))

In [89]:

intervals_6 = [int(scipy.special.binom(i,5)) for i in range(13,5,-1)]
c_intervals_6 = np.cumsum(intervals_6)

In [9]:
def index_partition():
    """Via la hockey stick identity, on construit l'ensemble des jetés de dés."""
    dict_index = {}
    for i_0 in range(9):
        for i_1 in range(9-i_0):
            for i_2 in range(9-i_0-i_1):
                for i_3 in range(9-i_0-i_1-i_2):
                    for i_4 in range(9-i_0-i_1-i_2-i_3):
                        for i_5 in range(9-i_0-i_1-i_2-i_3-i_4):
                            i_6 = 8-i_0-i_1-i_2-i_3-i_4-i_5
                            dict_index[((i_0,i_1,i_2,i_3,i_4,i_5,i_6))] = len(dict_index)
    return dict_index

In [13]:
dict_index = index_partition()

In [19]:
dict_number = {v: k for k, v in dict_index.items()}

In [51]:
list_numbers = list(dict_number.values())

In [29]:
reward_vector = torch.zeros(50)
reward_vector[21:25] = 1
reward_vector[25:29] = 2
reward_vector[29:33] = 3
reward_vector[33:37] = 4

Il y a deux cas : soit on choisit un nombre inacceptable et on va dans l'état $50$, soit on choisit un nombre acceptable, et en fonction du nombre choisi, on doit calculer la nouvelle valeur totale que l'on a, le nouveau nombre de dés restants, et mettre à jour les dés déjà obtenus (en passant simplement de l'état $n$ à $n+2^{c-1}$ où $c$ est la valeur du dé choisi)

In [213]:
def optimal_choice_1(state, action):
    # state is a vector (value, dice_values_picked, remaining_dice)
    # action is a value 0-1, representing whether to continue or not.
    value, dice_values_picked, dice_owned, draw_result = state
    dice_left = 8 - dice_owned
    if action == 0 or dice_left == 0:
        if dice_values_picked[-1] == 0:
            # We choose to stop but have no pickominos
            return c
        else:
            # We have found at least one pickomino
            return reward_vector[value]
    elif action == 1:
        # We choose to continue and have remaining dice
        list_draws = list_numbers[c_intervals_6[dice_owned]:c_intervals_6[dice_owned+1]]
        list_values = []
        for draw in list_draws:
            new_state = value, dice_values_picked, dice_owned, draw
            maximum_value = max([optimal_choice_2(new_state, i) for i in range(1,7)])
            list_values.append(maximum_value)
        print(list_values)
        print(f"{np.mean(list_values)=}")
        return np.mean(list_values)
        

def optimal_choice_2(state, action):
    # state is a vector (value, dice_values_picked, remaining_dice, draw_result)
    # here dice_values_picked is a vector of 6 values 0-1, representing whether the dice has been picked or not.
    # action is a number 0-6 representing the choice of dice to keep, or whether to not pick any dice.
    value, dice_values_picked, dice_owned, draw_result = state
    draw_values = draw_result
    if draw_values[action] == 0:
        return c
    else:
        if dice_values_picked[action-1] == 1:
            return c
        else:
            dice_values_picked[action-1] = 1
            new_state = value + min(action,5)*draw_values[action], dice_values_picked, dice_owned + draw_values[action], draw_result
            choices = [optimal_choice_1(new_state, 0), optimal_choice_1(new_state, 1)]
            return max(choices)


In [147]:
c = -3

In [155]:
def max_start_1(state):
    list_action_results = [optimal_choice_1(state, action) for action in range(2)]
    return max(list_action_results)

def max_start_2(state):
    list_action_results = [optimal_choice_2(state, action) for action in range(1,7)]
    return max(list_action_results)

In [215]:
c = 0

In [216]:
value_2 = 20
dice_values_picked_2 = [0,0,1,1,0,1]
dice_owned_2 = 4
draw_result_2 = (6,0,0,0,0,0,2)

state_2 = value_2, dice_values_picked_2, dice_owned_2, draw_result_2
max_start_1(state_2)

[0, 0, 0, 0, 0, 0]
np.mean(list_values)=0.0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, tensor(2.), 0, 0, 0, 0, 0]
np.mean(list_values)=0.09523809523809523
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, tensor(2.), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.mean(list_values)=0.03571428571428571
[0, tensor(2.), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
np.mean(list_values)=0.015873015873015872


0.015873015873015872