In [44]:
import numpy as np

In [46]:
# Five-state "bandit walk" MDP, laid out as T - 1 - 2 - 3 - T.
#   * Deterministic: every action succeeds with probability 1.0.
#   * States 1-3 are non-terminal; states 0 and 4 are terminal self-loops.
#   * Actions: 0 = left, 1 = right.
#   * The only reward (+1) is paid on the transition into the right-most state.
# Layout: bandit_walk[state][action] -> [(prob, next_state, reward, done), ...]
bandit_walk = {
    # left terminal state
    0: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 0, 0.0, True)]},
    # interior states: left moves to state - 1, right moves to state + 1
    1: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 2, 0.0, False)]},
    2: {0: [(1.0, 1, 0.0, False)], 1: [(1.0, 3, 0.0, False)]},
    3: {0: [(1.0, 2, 0.0, False)], 1: [(1.0, 4, 1.0, True)]},
    # right terminal state
    4: {0: [(1.0, 4, 0.0, True)], 1: [(1.0, 4, 0.0, True)]},
}

In [47]:
# Five-state "slippery bandit walk" MDP, laid out as T - 1 - 2 - 3 - T.
#   * Stochastic: the chosen action succeeds with probability 0.8 and
#     moves the agent in the opposite direction with probability 0.2.
#   * States 1-3 are non-terminal; states 0 and 4 are terminal self-loops.
#   * Actions: 0 = left, 1 = right.
#   * The only reward (+1) is paid on any transition into the right-most state.
# Layout: slippery_bandit_walk[state][action] -> [(prob, next_state, reward, done), ...]
# In each list the intended outcome comes first, the slip second.
slippery_bandit_walk = {
    # left terminal state
    0: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 0, 0.0, True)]},
    1: {
        0: [(0.8, 0, 0.0, True), (0.2, 2, 0.0, False)],
        1: [(0.8, 2, 0.0, False), (0.2, 0, 0.0, True)],
    },
    2: {
        0: [(0.8, 1, 0.0, False), (0.2, 3, 0.0, False)],
        1: [(0.8, 3, 0.0, False), (0.2, 1, 0.0, False)],
    },
    3: {
        0: [(0.8, 2, 0.0, False), (0.2, 4, 1.0, True)],
        1: [(0.8, 4, 1.0, True), (0.2, 2, 0.0, False)],
    },
    # right terminal state
    4: {0: [(1.0, 4, 0.0, True)], 1: [(1.0, 4, 0.0, True)]},
}

In [48]:
# Dump the full transition table of bandit_walk as a sanity check of the definition.
print(bandit_walk)

{0: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 0, 0.0, True)]}, 1: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 2, 0.0, False)]}, 2: {0: [(1.0, 1, 0.0, False)], 1: [(1.0, 3, 0.0, False)]}, 3: {0: [(1.0, 2, 0.0, False)], 1: [(1.0, 4, 1.0, True)]}, 4: {0: [(1.0, 4, 0.0, True)], 1: [(1.0, 4, 0.0, True)]}}


In [49]:
# Dump the full transition table of slippery_bandit_walk as a sanity check of the definition.
print(slippery_bandit_walk)

{0: {0: [(1.0, 0, 0.0, True)], 1: [(1.0, 0, 0.0, True)]}, 1: {0: [(0.8, 0, 0.0, True), (0.2, 2, 0.0, False)], 1: [(0.8, 2, 0.0, False), (0.2, 0, 0.0, True)]}, 2: {0: [(0.8, 1, 0.0, False), (0.2, 3, 0.0, False)], 1: [(0.8, 3, 0.0, False), (0.2, 1, 0.0, False)]}, 3: {0: [(0.8, 2, 0.0, False), (0.2, 4, 1.0, True)], 1: [(0.8, 4, 1.0, True), (0.2, 2, 0.0, False)]}, 4: {0: [(1.0, 4, 0.0, True)], 1: [(1.0, 4, 0.0, True)]}}


In [50]:
def policy_evaluation(pi, P, gamma=1.0, theta=1e-10):
    """Iteratively estimate the state-value function of a fixed policy.

    Each sweep applies the Bellman expectation backup

        V(s) <- sum over (prob, s', r, done) in P[s][pi[s]] of
                prob * (r + gamma * V(s') if not done else r)

    starting from all-zero values, and stops once the largest per-state
    change between two consecutive sweeps is below ``theta``.

    Args:
        pi: Policy, indexable by state (dict or sequence); ``pi[state]`` is
            the action taken in ``state``.
        P: Transition model. ``P[state][action]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples. States are expected
            to be the integers ``0 .. len(P) - 1``.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold on the maximum absolute value change.

    Returns:
        A numpy array of length ``len(P)`` holding the value estimate of
        every state (an empty array when ``P`` has no states).
    """
    state_space = len(P)
    prev_state_values = np.zeros(state_space)
    if state_space == 0:
        # np.max raises ValueError on a zero-size array, so an empty MDP
        # is answered directly with an empty value vector.
        return prev_state_values

    while True:
        # A fresh array per sweep keeps the previous estimates intact while
        # the new ones are accumulated (synchronous backup).
        current_state_values = np.zeros(state_space)
        for state in range(state_space):
            for prob, next_state, reward, done in P[state][pi[state]]:
                # Transitions flagged as done do not bootstrap from the
                # next state's value; only the immediate reward counts.
                future = 0.0 if done else gamma * prev_state_values[next_state]
                current_state_values[state] += prob * (reward + future)
        if np.max(np.abs(prev_state_values - current_state_values)) < theta:
            return current_state_values
        # No copy needed: current_state_values is re-allocated next sweep.
        prev_state_values = current_state_values
    

In [62]:
# Fixed policies over the five states 0..4, as state -> action maps
# (action 1 = right, action 0 = left).
policy_always_right = dict.fromkeys(range(5), 1)
policy_always_left = dict.fromkeys(range(5), 0)

In [63]:
# Evaluate both fixed policies on bandit_walk with a discount factor of 0.99.
state_values_always_right = policy_evaluation(
    pi=policy_always_right,
    P=bandit_walk,
    gamma=0.99,
)
state_values_always_left = policy_evaluation(
    pi=policy_always_left,
    P=bandit_walk,
    gamma=0.99,
)

In [64]:
# Print the state-value arrays computed by the preceding cell (one entry per state).
print(f"ALWAYS RIGHT : {state_values_always_right}")
print(f"ALWAYS LEFT : {state_values_always_left}")

ALWAYS RIGHT : [0.     0.9801 0.99   1.     0.    ]
ALWAYS LEFT : [0. 0. 0. 0. 0.]


In [65]:
# Evaluate both fixed policies on slippery_bandit_walk with a discount factor of 0.99.
# NOTE: this rebinds the result names used for the bandit_walk evaluation.
state_values_always_right = policy_evaluation(
    pi=policy_always_right,
    P=slippery_bandit_walk,
    gamma=0.99,
)
state_values_always_left = policy_evaluation(
    pi=policy_always_left,
    P=slippery_bandit_walk,
    gamma=0.99,
)

In [66]:
# Print the state-value arrays computed by the preceding cell (one entry per state).
print(f"ALWAYS RIGHT : {state_values_always_right}")
print(f"ALWAYS LEFT : {state_values_always_left}")

ALWAYS RIGHT : [0.         0.73111101 0.92311996 0.98277775 0.        ]
ALWAYS LEFT : [0.         0.01142361 0.057695   0.24569444 0.        ]
