# Exam Planning-Lab January 2023

In [None]:
import os, sys
module_path = os.path.abspath(os.path.join('tools'))
if module_path not in sys.path:
    sys.path.append(module_path)

import gym, envs
from utils.ai_lab_functions import *
import numpy as np

## Exercise 1

<img src="images/ex1_.png" alt="ex1" style="zoom: 30%;" />

Consider the mobility graph in the figure above, where $S$ and $G$ are the start and goal positions, respectively, and $W_{i,j}$ represents the moving cost between the node $i$ and $j$. Assume all the $W_{i,j}$ are all integers and strictly positive. Assume the taxi can move in the following directions $<"L", "R", "U", "D">$. Note that there is a cost even in the case of an action that causes the agent to remain on the same starting state (e.g., if from state $0$ the agent performs action $L$ remains in $0$ with cost $W_{0,0}$).

Answer the following questions:

### 1.1) Suppose there are no people in the grid world and a unitary cost for every $W_{i,j}$. Implement the A* search strategy using an admissible and consistent heuristic for the problem of finding a minimum-cost path (ignoring people requests). Show the result of the execution of your code and the total path cost.

In [None]:
# Please note that you need to modify the code developed in the lab to consider the cost of each displacement 
# (hint: when you create the child node) defined in the next cell and passed as a parameter to your method. 

def present_with_higher_cost(queue, node):
    if node.state in queue:
        if queue[node.state].value > node.value: 
            return True
    return False


def astar_graph(environment, weights):
    """
    A* Graph Search
    
    Args:
        problem: OpenAI Gym environment
        
    Returns:
        (path, time_cost, space_cost): solution as a path and stats.
    """

    goalpos = environment.goalstate
    print(goalpos)
    explored = set()
    frontier = PriorityQueue()
    node = Node(environment.startstate,None)
    print(node.state)
    frontier.add(node)
    
    time_cost = 1
    space_cost = 1
    
    while True:

        if len(frontier) == 0:
            return None, time_cost, space_cost
            
        space_cost = max(space_cost,len(frontier)+ len(explored))
        parent = frontier.remove()
        explored.add(parent.state) # addo a explored 
        if parent.state == goalpos: # END 
            return parent.pathcost, build_path(parent), time_cost, space_cost
        
        for action in environment.actions:
            time_cost+=1
            child_state= environment.sample(parent.state, action)
            heuristic_value= Heu.l1_norm(environment.state_to_pos(child_state),environment.state_to_pos(goalpos))
            child = Node(child_state,parent,parent.pathcost+ weights[parent.state,child_state],heuristic_value+ parent.pathcost + weights[parent.state,child_state])
            if (child.state not in frontier) and (child.state not in explored): 
                frontier.add(child)
            elif present_with_higher_cost(frontier, child):
                frontier.replace(child) # PRIORITY QUEUE REPLACE FUNCTION, replaces the node with the same value, that functions as an ID   
  




**Results of your solution:**

In [None]:
'''
This dictionary stores all the moving costs for the environment depicted above. In particular,
each key (first element of the dictionary) is a tuple that consists of (start_state, goal_state). 
The value, i.e., the second element of the dictionary, is the moving cost.

'''


weights = {(0,1): 1, # start from state 0 and reach state 1, with cost 1
           (1,0): 1, # start from state 1 and reach state 0, with cost 1
           (1,2): 1,
           (2,1): 1,
           (2,5): 1,
           (5,2): 1,
           (5,4): 1,
           (4,5): 1,
           (4,1): 1,
           (1,4): 1,
           (4,3): 1,
           (3,4): 1, 
           (3,0): 1,
           (0,3): 1,
           (0,0): 1, # this key-value pair encodes the situation in which the agent starts from state 0 
                     # and performs an action that force it to remain in the same state, but accumulates 
                     # a unit cost.
           (1,1): 1,
           (2,2): 1,
           (3,3): 1,
           (4,4): 1,
           (5,5): 1,
          }


env = gym.make('SmallMaze-v0')
total_path_cost, solution, time, memory = astar(env,weights) 
print("Total path cost: ", total_path_cost)
print("Solution A*: {}".format(solution_2_string(solution, env)))
print("N° of nodes explored: {}".format(time))
print("Max n° of nodes in memory: {}\n".format(memory))

### 1.2) By implementing a function, verify if the heuristic chosen is admissible and consistent. Briefly motivate, with a comment at the beginning of the function, the process to check the consistency.

In [None]:
"""
ANSWER:

I chosed the l1 heuristic because during the lab session we discovered it is dominant
with respect to l2 and chebyshev. 
An heuristic h1 is dominant with respect to an heuristic h2 if for every state h1(s) >= h2(s).
If an heuristic is dominant it's better and resolves the problem in less time.
I decided to give the proof only for consistency because if an heuristic is consistent it's also admissable.
Consistency for an heuristic h1 can be defined as follows:
    h1(s) <= c(s,a,s1) + h1(s1)

"""
def consistency_control(environment,weights):
    goalpos = environment.state_to_pos(environment.goalstate)

    for s in range(environment.observation_space.n):
        if environment.grid[s] == "W": continue 
        for action in environment.actions:

            s1 = environment.sample(s,action)
            hn_s = Heu.l1_norm(environment.state_to_pos(s), goalpos)
            hn_s1 = Heu.l1_norm(environment.state_to_pos(s1), goalpos)

            if not (hn_s - hn_s1 <= weights[s,s1]): # action_cost = 1 
                print("The L1 heuristic is not Consistent")
                return False 

    print("The L1 heuristic Manhattan distance is Consistent")
    return True 





env = gym.make('SmallMaze-v0')
consistency_control(env,weights)

### 1.3) Give a weight allocation such that the best solution achieved with A* (tree search version) strategy goes through all the nodes with passengers and reaches the node goal $𝐺$ (i.e., that the agent is forced to go through node $id=3$ and node $id=2$, before reaching the goal node). Show with your code the results of the execution and the total path cost.

In [None]:
def astar_tree_search(environment, weights):
    """
    A* Tree search
    
    Args:
        problem: OpenAI Gym environment
        
    Returns:
        (path, time_cost, space_cost): solution as a path and stats.
    """
    goalpos = environment.goalstate
    frontier = PriorityQueue()
    node = Node(environment.startstate,None)
    frontier.add(node)
    
    time_cost = 1
    space_cost = 1
    
    while True:
        if len(frontier) == 0:
            return None, time_cost, space_cost
        space_cost = max(space_cost,len(frontier))
        
        parent = frontier.remove()
        if parent.state == goalpos:
            return parent.pathcost, build_path(parent), time_cost, space_cost
        
        for action in environment.actions:
            time_cost+=1
            child_state= environment.sample(parent.state, action)
            heuristic_value= Heu.l1_norm(environment.state_to_pos(child_state),environment.state_to_pos(goalpos))
            child = Node(child_state,parent,parent.pathcost +weights[parent.state,child_state],heuristic_value+ parent.pathcost + weights[parent.state,child_state])
            frontier.add(child)
            if present_with_higher_cost(frontier, child):
                frontier.replace(child) 
    

**Results of your solution:**

In [None]:
env = gym.make('SmallMaze-v0')

# your code for the weight allocation here
weights[0,1] = 6 
weights[4,1] = 6 

total_path_cost, solution, time, memory = astar_tree_search(env,weights) # your code here
print("Total path cost: {}".format(total_path_cost))
print("Solution A*(tree-search): {}".format(solution_2_string(solution, env)))
print("N° of nodes explored: {}".format(time))
print("Max n° of nodes in memory: {}\n".format(memory))

### 1.4) Is the heuristic originally chosen still consistent in this case? Motivate your answer with the code.

In [None]:
"""
The heuristic is still consistent because it does not overextimate the move_cost 
value defined by the weights.
We can check it easily by doing the operation on the only two action changed:
    
    h(s)<= c(s,a,s1) + h(s1) 
    
    1 <= 6+0  (for state 0 moving into 1 )
    1 <= 6+0  (for state 4 moving into 1 )

"""
consistency_control(env,weights)

### 1.5) Considering the task of transporting all passengers to the final location and the results obtained in the previous point, can you provide a different approach that finds a solution which guarantees least cost exploring less nodes? Motivate your answer by showing possible improvements with different strategies.

In [None]:
""" 
ANSWER:

By using A* with graph search it makes drastics improvements to the costs by adding
an explored set for state values.
"""


**Results of your solution:**

In [None]:
env = gym.make('SmallMaze-v0')

# your code for the weight allocation here (should be the same as the previous point)
# So there is no need to reallocate them right ??? 

# WE ARE USING A* GRAPH SEARCH 
total_path_cost, solution, time, memory = astar_graph(env,weights) 
print("Total path cost: {}".format(total_path_cost))
print("Solution: {}".format(solution_2_string(solution, env)))
print("N° of nodes explored: {}".format(time))
print("Max n° of nodes in memory: {}\n".format(memory))

### 1.6) Implement a Greedy Best First approach for this particular environment. With this strategy, can we obtain the desired solution (i.e., all the individuals transferred to the goal node $G$)?

In [None]:

"""
There is no point in using greedy bfs because it doesnt care about the weights when computing 
the value of the node, in other words it cant reach the goal after taking all the passengers. 
"""

def greedy_graph_search(environment,weights): 

    goalpos = environment.goalstate
    print(goalpos)
    explored = set()
    frontier = PriorityQueue()
    node = Node(environment.startstate,None)
    print(node.state)
    frontier.add(node)
    
    time_cost = 1
    space_cost = 1
    
    while True:
        if len(frontier) == 0:
            return None, time_cost, space_cost
            
        space_cost = max(space_cost,len(frontier)+ len(explored))
        parent = frontier.remove()
        if parent.state == goalpos: 
            return parent.pathcost, build_path(parent), time_cost, space_cost
        explored.add(parent.state) 
        
        for action in environment.actions:
            time_cost+=1
            child_state= environment.sample(parent.state, action)
            heuristic_value= Heu.chebyshev(environment.state_to_pos(child_state),environment.state_to_pos(goalpos))
            child = Node(child_state,parent,parent.pathcost+ weights[parent.state,child_state],heuristic_value)
            if (child.state not in frontier) and (child.state not in explored): 
                frontier.add(child)
            elif present_with_higher_cost(frontier, child):
                frontier.replace(child) 





**Results of your solution:**

In [None]:
env = gym.make('SmallMaze-v0')


# your code for the weight allocation here

total_path_cost, solution, time, memory = greedy_graph_search(env,weights) 
print("Total path cost: {}".format(total_path_cost))
print("Solution Greedy Best First: {}".format(solution_2_string(solution, env)))
print("N° of nodes explored: {}".format(time))
print("Max n° of nodes in memory: {}\n".format(memory))

**Motivate your results:**

As explained above the code the Greedy bfs only cares about the value of the heuristic when calculating the value of the node. This makes it impossible for the algorithm to pick up all passengers before heading to the goal because it ***doesnt take in consideration the changes we applied to the weights***.

### 1.7) Could you modify the environment maintaining the same weight allocation so that the manhattan distance is no loger consistent? Note that the problem should remain a single state problem.

if we add walls maybe ? 
COME BACK LATER TO THIS QUESTION

## Exercise 2

<img src="images/frozen.png" alt="ex1" style="zoom: 30%;" />

Winter is here. You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mainly frozen, but there are a few holes (highlighted with the letter 𝐻 in the picture) where the ice has melted. Hence, you decide to use your robotic agent to retrieve the frisbee. However, if it steps into one of those holes, it will fall into the freezing water, and you lose both the robotic agent and the frisbee forever.

Suppose the transition model for the environment is unknown. The return is discounted $\gamma = 0.9$; the episode ends when the agent reaches the goal or falls into a hole. Moreover, the agent receives a reward of $+1$ if it retrieves the frisbee, i.e., it reaches the cell $(3,3)$, $0$ otherwise.


### 2.1) Given the environment reported above, implement an algorithm that can find the optimal policy, i.e., that achieves a 100% of success rate. Test the resulting policy using the provided code. ( For the action choice, you can use the explor_fun provided here below)

**Utility functions provided for you:**

In [None]:
def explor_fun(env, q, state):
    
    # Choose the action with the highest value in the current state
    if np.max(q[state]) > 0:
        action = np.argmax(q[state])

    # If there's no best action (only zeros), take a random one
    else:
        action = env.action_space.sample()
        
    return action


def test_success(environment, qtable):
    episodes = 100
    nb_success = 0

    # Evaluation
    for _ in range(100):
        state = environment.reset()
        done = False

        # Until the agent gets stuck or reaches the goal, keep training it
        while not done:

            #select the action using the explore function provided above
            action = explor_fun(environment, qtable, state)

            # perform the action in the environment
            new_state, reward, done, info = environment.step(action)

            # Update our current state
            state = new_state

            if done and reward > 0:
                nb_success += reward

    # Let's check our success rate!
    print (f"Success rate = {nb_success/episodes*100}%")
    
    
def execute_policy(environment, qtable):
    import time
    
    state = environment.reset()
    done = False
    sequence = []

    while not done:

        #select the action using the explore function provided above
        action = explor_fun(environment, qtable, state)

        # Add the action to the sequence
        sequence.append(action)

        # perform the action in the environment
        new_state, reward, done, info = environment.step(action)

        # Update our current state
        state = new_state

        # Update the render
        environment.render()
        time.sleep(1)

    actions = ["L", "D", "R", "U"]
    print(f"Sequence = {[actions[a] for a in sequence]}")

**Your solution:**

In [81]:
"""
We use sarsa because, by being on policy, it wont make the greediest choice like q-learning
and will calculate the new best action based on epsilon greedy.
I choosed SARSA because it's safer for the robot during the training stage and it'll be 
less likely that it will fall in the holes on the ice surface.
"""

def sarsa(environment, episodes, alpha, gamma, expl_func):

    q = np.zeros((environment.observation_space.n, environment.action_space.n))  # Q(s, a)
    rews = np.zeros(episodes)
    lengths = np.zeros(episodes)

    for episode in range(episodes):
        state = env.reset()
        a = expl_func(environment,q,state)
        while True:
            next_state, reward, done, _ = env.step(a)  # Execute a step
            a_1 = expl_func(environment,q,next_state)
            rews[episode] += reward
            lengths[episode]+= 1
            q[state,a] = q[state,a] + alpha*(reward + gamma*q[next_state,a_1]  - q[state,a])
            if done: 
                break
            state = next_state
            a = a_1

    policy = q.argmax(axis=1) # q.argmax(axis=1) automatically extract the policy from the q table
    return q,policy 


In [87]:
environment = gym.make("FrozenLake-v0", is_slippery=False)

# Hyperparameters
episodes = 100000        #bTotal number of episodes
alpha = 0.5            # Learning rate
gamma = 0.9            # Discount factor

qtable, policy = sarsa(environment,episodes,alpha,gamma,explor_fun)
test_success(environment, qtable)

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
Sequence = ['D', 'R']
Success rate = 1.0%


**See the execution of your trained agent in the environment:**

In [88]:
execute_policy(environment, qtable)

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Up)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
Sequence = ['D', 'L', 'L', 'D', 'R', 'R', 'U', 'R']


### 2.2) Consider and environment with states {A, B, C, D}, actions {r, l} where states {A, D} are terminal. Consider the following sequence of learning episodes:
* E1: (B, r, C, −1),(C, r, C, −1),(C, r, D, +1)
* E2: (B, r, C, −1),(C, r, D, +1)
* E3: (B, l, A, +5)
* E4: (B, l, B, −1),(B, l, B, −1),(B, l, A, +5)

### Build an estimation of the transition model T using the code provided below.

#### hint: The transition model T is a probability distribution that describes the likelihood of transitioning from one state to another given an action. We can estimate T from a sequence of learning episodes by counting the number of times each transition occurs and dividing by the total number of transitions for a given state-action pair.

In [101]:
episodes = {1: [('B', 'r', 'C', -1), ('C', 'r', 'C', -1),('C', 'r', 'D', 1)], 
            2: [('B', 'r', 'C', -1), ('C', 'r', 'D', 1)], 
            3: [('B', 'l', 'A', 5)], 
            4: [('B', 'l', 'B', -1),('B', 'l', 'B', -1),('B', 'l', 'A', 5)]
           }


#T:

count_brc = 0
tot_count_br = 0

count_bla = 0
count_blb = 0
tot_count_bl = 0

count_crc = 0
tot_count_cr = 0

count_crd = 0


for episode, values in episodes.items():
    for tuple_t in values:

        # BRC 
        if tuple_t[:3] == ('B', 'r', 'C'): count_brc += 1
        # BLA 
        if tuple_t[:3] == ('B', 'l', 'A'): count_bla += 1
        # BlB 
        if tuple_t[:3] == ('B', 'l', 'B'): count_blb += 1
        # CRC 
        if tuple_t[:3] == ('C', 'r', 'C'): count_crc += 1
        # CRD 
        if tuple_t[:3] == ('C', 'r', 'D'): count_crd += 1

        # BR 
        if tuple_t[:2] == ('B', 'r'): tot_count_br += 1
        # BL 
        if tuple_t[:2] == ('B', 'l'): tot_count_bl += 1
        # CR 
        if tuple_t[:2] == ('C', 'r'): tot_count_cr += 1


t_br,t_bl,t_cr= [0 for _ in range(4)],[0 for _ in range(4)],[0 for _ in range(4)]
t_br[2] = (count_brc/tot_count_br)
t_bl[0] = (count_bla/tot_count_bl)
t_bl[1] = (count_blb/tot_count_bl)
t_cr[2] = (count_crc/tot_count_cr)
t_cr[3] = (count_crd/tot_count_cr)

print('Prob of starting in B compute action r and finish in C is: ', count_brc/tot_count_br)
print('Prob of starting in B compute action l and finish in A is: ', count_bla/tot_count_bl)
print('Prob of starting in B compute action l and finish in B is: ', count_blb/tot_count_bl)
print('Prob of starting in C compute action r and finish in C is: ', count_crc/tot_count_cr)
print('Prob of starting in C compute action r and finish in D is: ', count_crd/tot_count_cr)

Prob of starting in B compute action r and finish in C is:  1.0
Prob of starting in B compute action l and finish in A is:  0.5
Prob of starting in B compute action l and finish in B is:  0.5
Prob of starting in C compute action r and finish in C is:  0.3333333333333333
Prob of starting in C compute action r and finish in D is:  0.6666666666666666


### 2.3) Compute V(s) for all non-terminal states by using a direct evaluation approach (i.e., without considering the structure of the bellman equation).  

In [None]:
episodes = {1: [('B', 'r', 'C', -1), ('C', 'r', 'C', -1),('C', 'r', 'D', 1)], 
            2: [('B', 'r', 'C', -1), ('C', 'r', 'D', 1)], 
            3: [('B', 'l', 'A', 5)], 
            4: [('B', 'l', 'B', -1),('B', 'l', 'B', -1),('B', 'l', 'A', 5)]
           }

v = {'A': 5, 'B': 0, 'C': 0, 'D': 1}

count_b = 0
reward_b = 0
count_c = 0
reward_c = 0

for episode, values in episodes.items():
    for tuple_t in values:
            
        if tuple_t[0] == ('C'):
            #c= True  NO NEED 
            reward_c += tuple_t[-1]
            count_c += 1
        else:
            #if tuple_t[0] == ('B'):  A and B are the only non-terminal states so they are alwaays one of the two starting states
            count_b += 1
            reward_b += tuple_t[-1]
            



v['B'] = None 
v['C'] = None  

print(v)

### 2.4) Consider and environment with states {A, B, C, D}, actions {r, l} where states {A, D} are terminal. Consider the following different sequence of learning episodes:
* E1: (B, r, C, −1),(C, r, C, −1),(C, r, D, +1)
* E2: (B, r, C, −1),(C, r, D, +1)
* E3: (B, l, A, +5)
* E4: (B, l, B, −1),(B, l, B, −1),(B, l, A, +5)
### Compute v(s) for all non-terminal states by using a sample-based evaluation approach assuming $\alpha$ = .1 and $\gamma$ = 0.9.

In [102]:
# I just added the formula for the sample based approach in the for for each tuple 

episodes = {1: [('B', 'r', 'C', -1), ('C', 'r', 'C', -1),('C', 'r', 'D', 1)], 
            2: [('B', 'r', 'C', -1), ('C', 'r', 'D', 1)], 
            3: [('B', 'l', 'A', 5)], 
            4: [('B', 'l', 'B', -1),('B', 'l', 'B', -1),('B', 'l', 'A', 5)]}
v = {'A': 5, 'B': 0, 'C': 0, 'D': 1}
alpha = 0.1
gamma = 0.9

for episode, values in episodes.items():
    for tuple_t in values:
       v[tuple_t[0]] = (1-alpha) * v[tuple_t[0]] + alpha*(tuple_t[-1] + gamma*v[tuple_t[2]])
       
        
print(v)

{'A': 5, 'B': 1.4651930390000003, 'C': 0.28, 'D': 1}


## Exercise 3

<img src="images/env3.png" alt="ex1" style="zoom: 30%;" />

Consider the environment displayed in Figure above, where states $(0, 3)$ and $(1, 3)$ are terminal states with reward $+1$ and $−1$ respectively. The agent can move in the four directions. The transition model states that for every state and action the agent has $0.8$ chances of moving in the chosen direction and $0.1$ chances to move in the othogonal directions.The reward model states that for every state, action and successor state the agent pays $−0.01$. Assume that the discount factor is set to $\gamma = 0.9$. Answer the following questions: 

### 3.1) Use one of the methods developed in the lab to compute the optimal policy and the value function. Print the results.

In [None]:
"""

your code here

"""

In [None]:
env = gym.make('VeryBadLavaFloor-v0')
env.render()
print()

policy, values = None, None# your code here
policy_render = np.vectorize(env.actions.get)(policy.reshape(env.rows, env.cols))
print(values)
print(policy_render)

### 3.2) Consider the optimal policy obtained in the previous exercise and focus on states $(2, 1)$. State why in that state the policy selects the "U" action, and what is the $\Delta$  to instead choosing an "L" action  (i.e., $Q((2,1),RIGHT)$ - $Q((2,1),LEFT)$). Motivate your answer with the code.

In [None]:
"""

your code here

"""

### 3.3) Consider now the starting state $S$,  cell $(0,0)$. What is the $\Delta$ between the optimal action and the second-best one? Print both the action and the $\Delta$ value. The code should be parametric, i.e., if we change the initial state, it must return the correct answer.

In [None]:
"""

your code here

"""

### 3.4) Consider the following environment where states $(0, 3)$ and $(1, 3)$ are terminal states with reward $+1$ and $−1$ respectively. The transition model is the same one defined above, however the agent now gets a $+0.5$ for every action perfomed in the environment. Is it possible to get an optimal policy that allows an agent to start from state $(0,0)$ and reach state $(0,3)$? Motivate your answer with the code.

<img src="images/env3.png" alt="ex1" style="zoom: 30%;" />

In [92]:
env = gym.make('NiceLavaFloor-v0')
env.render()
print()

policy, values = #your code here
policy_render = np.vectorize(env.actions.get)(policy.reshape(env.rows, env.cols))
print(values)
print(policy_render)

SyntaxError: invalid syntax (1417855668.py, line 5)

### 3.5) Compute the probability of ending in state (1, 3) if we execute the sequence of actons < Right, Up > from state (0, 2). Motivate your answer reporting the code and the solution printed. 

In [100]:
"""
The probability result is 0.01, I computed it by just cycling 
in all states, from the starting position and calculating 
the probability to go in the right direction.
We cycle 2 times trough states, once for the first action and another time for the second.
We exit from the loop once we reach the desired state with our two actions.

"""
env = gym.make('NiceLavaFloor-v0')

id_start_state = env.pos_to_state(0,2) # start state 
id_end_state = env.pos_to_state(1,3) # end state 
state = id_start_state
actions = {0: "L", 1: "R", 2: "U", 3: "D"}

prob = 0
action_1 = 1
action_2 = 2
probs_fin = 0

for next_state in range(env.observation_space.n):
    if env.T[state, action_1, next_state] == 0:
        continue

    probs = env.T[state, action_1, next_state]
    
 
    for second_next_state in range(env.observation_space.n):
        if env.T[next_state, action_2, second_next_state] == 0:
            continue

        if second_next_state == id_end_state:
    
            probs *= env.T[next_state, action_2, second_next_state]

            print(f'{env.state_to_pos(state)} --> {env.state_to_pos(next_state)} --> {env.state_to_pos(second_next_state)}')
            print(f'probs: {env.T[state, action_1, next_state]} --> {env.T[next_state, action_2, second_next_state]}')
            probs_fin += probs

            print('================')
            break
    
print()
print('Probability: ', round(probs_fin,2))

(0, 2) --> (1, 2) --> (1, 3)
probs: 0.1 --> 0.1

Probability:  0.01
