# EVALUATION ET AMÉLIORATION D’UNE POLITIQUE

In [1]:
import forTp2 as forTp2

In [31]:
import numpy as np


GRID_SIZE = 5
TERMINAL_STATES = [0, GRID_SIZE*GRID_SIZE-1]
states = np.arange(GRID_SIZE*GRID_SIZE)
actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
discount = 1.

def next_state(grid_size, state, action):
    i,j = np.unravel_index(state, (grid_size, grid_size))
    if action == 'UP':
        i = np.maximum(0,i-1)
    elif action == 'DOWN':
        i = np.minimum(i+1,grid_size-1)
    elif action == 'RIGHT':
        j = np.minimum(j+1,grid_size-1)
    elif action == 'LEFT':
        j = np.maximum(0,j-1)    
    new_state = np.ravel_multi_index((i,j), (grid_size, grid_size))
    return new_state
    
def is_done(state, terminal_states):
    return state in terminal_states

# The unifom policy            
uniform_policy = {s : { a : 1/len(actions) for a in actions } for s in states}


# Transition is coded as a dictionary of dictionary
P = {}
for s in range(len(states)):
    P[s] = {a : () for a in actions}
    if s in TERMINAL_STATES:
        # if terminal state, stay where you are
        # instead of next_state
        reward = 0.
        for action in actions:
            P[s][action] = (s, reward, True)
    else:
        # transition
        reward = -1.
        for action in actions:
            next_s = next_state(GRID_SIZE, s, action)
            P[s][action] = (next_s,reward,is_done(next_s, TERMINAL_STATES))

In [68]:
def policy_evaluation_1(policy, P, discount=1.0, theta=1e-6, max_iterations=300):
    
    state_len = len(P.keys())
    actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
    action_len = len(actions)
    states = np.arange(GRID_SIZE*GRID_SIZE)

    # Number of evaluation iterations
    evaluation_iterations = 1
    # Initialize a value function for each state as zero
    V = np.zeros(state_len)
    # Repeat until change in value is below the threshold
    for i in range(int(max_iterations)):
        # Initialize a change of value function as zero
        delta = 0
        # Iterate though each state
        for state in range(state_len):
            # Initial a new value of current state
            v = 0
            # Try all possible actions which can be taken from this state
            for action_state, action in enumerate(policy[state]):
                action_probability = policy[state][action]
                # Check how good next state will be
                #print(P[state][action])
                #for next_state, reward, terminated in P[state][action]:
                    # Calculate the expected value
                for action in actions:
                    next_state, reward, done = P[state][action]
                    v += uniform_policy[s][action] * action_probability * (reward + discount * V[next_state])

            # Calculate the absolute change of value function
            delta = max(delta, np.abs(V[state] - v))
            # Update value function
            V[state] = v
            
        evaluation_iterations += 1

        # Terminate if value change is insignificant
        if delta < theta:
            print(f'Policy evaluated in {evaluation_iterations} iterations.')
            return V

In [69]:
VV = policy_evaluation_1(uniform_policy, P)
VV

In [50]:
def one_step_lookahead(P, state, V, discount_factor):
        action_values = np.zeros(4)
        probability = 0.25
        actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
        uniform_policy = {s : { a : 1/len(actions) for a in actions } for s in states}
        
        for s in range(len(P.keys())):
            # we will get new state value
            new_s = 0
            # for all actions
            for a in range(len(actions)):
                # for all transitions from currect state
                for action in actions:
                    #print("i->", i)
                    #print("uniform_policy ->", uniform_policy)
                    next_state, reward, done = P[s][action]
                    #print(action_values[a])
                    action_values[a] += uniform_policy[s][action] * (reward + discount_factor * V[next_state])
        return action_values

In [115]:
def policy_iteration(P, discount_factor=1.0, max_iterations=300):
        
        state_len = len(P.keys())
        action_len = 4 # up right down left
        states = np.arange(GRID_SIZE*GRID_SIZE)
        actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
        transition_prob = 1/4
        
        # Start with a random policy
        #num states x num actions / num actions
        #policy = np.ones([state_len, action_len]) / action_len
        policy = {s : { a : 1/len(actions) for a in actions } for s in states}
        #print(policy)
        # Initialize counter of evaluated policies
        evaluated_policies = 1
        # Repeat until convergence or critical number of iterations reached
        for i in range(int(max_iterations)):
            stable_policy = True
            # Evaluate current policy
            #V = policy_evaluation(policy, environment, discount_factor=discount_factor)
            V = policy_evaluation_1(policy, P,discount=1.0, theta=1e-6, max_iterations=500)
            # Go through each state and try to improve actions that were taken (policy Improvement)
            for state in range(state_len):
                # Choose the best action in a current state under current policy
                #print(policy[state])
                current_action = np.argmax(policy[state])
                # Look one step ahead and evaluate if current action is optimal
                # We will try every possible action in a current state
                action_value = one_step_lookahead(P, state, V, discount_factor)
                # Select a better action
                best_action = np.argmax(action_value)
                # If action didn't change
                print(current_action)
                if current_action != best_action:
                    #stable_policy = True
                    # Greedy policy update
                    print('test')
                    policy[state] = np.eye(action_len)[best_action]

            evaluated_policies += 1
            # If the algorithm converged and policy is not changing anymore, then return final policy and value function
            if stable_policy:
                print(f'Evaluated {evaluated_policies} policies.')
                return policy, V

In [116]:
policcc, vvvv = policy_iteration(P)

Policy evaluated in 306 iterations.
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
Evaluated 2 policies.


In [113]:
vvvv

array([  0.        , -22.99998915, -34.33331698, -39.6666477 ,
       -41.66664672, -22.99998915, -30.66665249, -36.33331665,
       -38.99998218, -39.66664859, -34.33331698, -36.33331665,
       -37.33331665, -36.33331743, -34.33331848, -39.6666477 ,
       -38.99998218, -36.33331743, -30.66665379, -22.9999906 ,
       -41.66664672, -39.66664859, -34.33331848, -22.9999906 ,
         0.        ])

In [114]:
policcc

{0: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 1: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 2: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 3: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 4: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 5: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 6: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 7: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 8: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 9: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 10: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 11: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 12: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 13: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 14: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 15: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 16: {'UP': 0.25, 

## PART 1- POLICY EVALUATION

In [243]:
def policy_evaluation(P, state_values, GRID_SIZE ,gamma = 1 ,theta=0.0000001):
    
    delta = 1
    state_len = len(P.keys())
    action_len = 4 # up right down left
    states = np.arange(GRID_SIZE*GRID_SIZE)
    actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
    transition_prob = 1/4
    
    # The unifom policy            
    uniform_policy = {s : { a : 1/len(actions) for a in actions } for s in states}
    
    while (delta > theta):
        delta = 0
        # for all state
        for s in range(state_len):
            # we will get new state value
            new_s = 0
            # for all actions
            for a in range(action_len):
                # get the current transitions list (U,D,L,R)
                transitions_list = P[s][actions[a]]
                # print("transitions_list - >", transitions_list)
                # for all transitions from currect state
                for i in actions:
                    #print("i->", i)
                    #print("uniform_policy ->", uniform_policy)
                    #print(P[s][i])
                    next_state, reward, done = P[s][i]
                    new_s += uniform_policy[s][i]*transition_prob*(reward+gamma*state_values[next_state])
        
            delta = max(delta, np.abs(new_s - state_values[s])) 
            state_values[s] = new_s
    return state_values



In [244]:
state_values = [-1 for i in range(GRID_SIZE*GRID_SIZE)]
state_values[0] = 0
state_values[GRID_SIZE*GRID_SIZE - 1] = 0
state_values_n = policy_evaluation(P,state_values, GRID_SIZE)

print(state_values_n)

[0.0, -13.999999353455285, -19.999999070505464, -21.999998976918956, -13.999999353455287, -17.9999992066048, -19.99999913855513, -19.999999148405234, -19.999999070505467, -19.99999913855513, -17.999999273098258, -13.99999945728622, -21.99999897691896, -19.999999148405237, -13.99999945728622, 0.0]


In [245]:
def one_step_lookahead(P, state, V, discount_factor):
        action_values = np.zeros(4)
        probability = 0.25
        actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
        uniform_policy = {s : { a : 1/len(actions) for a in actions } for s in states}
        
        for s in range(len(P.keys())):
            # we will get new state value
            new_s = 0
            # for all actions
            for a in range(len(actions)):
                # for all transitions from currect state
                for action in actions:
                    #print("i->", i)
                    #print("uniform_policy ->", uniform_policy)
                    next_state, reward, done = P[s][action]
                    print(action_values[a])
                    action_values[a] += uniform_policy[s][action] * (reward + discount_factor * V[next_state])
        return action_values

In [None]:
one_step_lookahead(P,1,V,0.1)

# PART 2 - POLICY IMPROVEMENT 

In [413]:
def policy_iteration(P, discount_factor=1.0, max_iterations=1e9):
        
        state_len = len(P.keys())
        action_len = 4 # up right down left
        states = np.arange(GRID_SIZE*GRID_SIZE)
        actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
        transition_prob = 1/4
        
        # Start with a random policy
        #num states x num actions / num actions
        #policy = np.ones([state_len, action_len]) / action_len
        policy = {s : { a : 1/len(actions) for a in actions } for s in states}
        #print(policy)
        # Initialize counter of evaluated policies
        evaluated_policies = 1
        # Repeat until convergence or critical number of iterations reached
        for i in range(int(max_iterations)):
                stable_policy = True
                # Evaluate current policy
                #V = policy_evaluation(policy, environment, discount_factor=discount_factor)
                V = policy_evaluation_1(P, state_values, GRID_SIZE)
                # Go through each state and try to improve actions that were taken (policy Improvement)
                for state in range(state_len):
                        # Choose the best action in a current state under current policy
                        current_action = np.argmax(policy[state])
                        # Look one step ahead and evaluate if current action is optimal
                        # We will try every possible action in a current state
                        action_value = one_step_lookahead(P, state, V, discount_factor)
                        # Select a better action
                        best_action = np.argmax(action_value)
                        # If action didn't change
                        if current_action != best_action:
                                stable_policy = True
                                # Greedy policy update
                                policy[state] = np.eye(action_len)[best_action]
                evaluated_policies += 1
                # If the algorithm converged and policy is not changing anymore, then return final policy and value function
                if stable_policy:
                        print(f'Evaluated {evaluated_policies} policies.')
                        return policy, V

In [265]:
policy_out, Va = policy_iteration(P)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-3.7499999044005428
-8.99999976696353
-13.749999649650453
0.0
-3.7499999044005428
-8.99999976696353
-13.749999649650453
0.0
-3.7499999044005428
-8.99999976696353
-13.749999649650453
0.0
-3.7499999044005428
-8.99999976696353
-13.749999649650453
-13.999999649650453
-19.249999512213442
-24.99999936093853
-30.249999233563486
-13.999999649650453
-19.249999512213442
-24.99999936093853
-30.249999233563486
-13.999999649650453
-19.249999512213442
-24.99999936093853
-30.249999233563486
-13.999999649650453
-19.249999512213442
-24.99999936093853
-30.249999233563486
-33.999999137964025
-39.749998986689114
-45.4999988354142
-50.749998709495614
-33.999999137964025
-39.749998986689114
-45.4999988354142
-50.749998709495614
-33.999999137964025
-39.749998986689114
-45.4999988354142
-50.749998709495614
-33.999999137964025
-39.749998986689114
-45.4999988354142
-50.749998709495614
-55.9999985720586
-56.2499985720586
-60.99999845474552
-66.2

In [266]:
policy

{0: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 1: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 2: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 3: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 4: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 5: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 6: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 7: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 8: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 9: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 10: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 11: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 12: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 13: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 14: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25},
 15: {'UP': 0.25, 'RIGHT': 0.25, 'DOWN': 0.25, 'LEFT': 0.25}}

In [39]:
def policy_improvement(P, state_values, GRID_SIZE, policy_eval_fn=policy_evaluation):
    # Initiallize a policy arbitarily
    actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
    policy = np.zeros([len(P.keys()), 4]) / 4
    
    while True:
        # Compute the Value Function for the current policy
        V = policy_evaluation(P, state_values, GRID_SIZE)
        
        
        
        # Will be set to false if we update the policy
        policy_stable = True
        
        # Improve the policy at each state
        for s in range(len(P.keys())):
            # The best action we would take under the currect policy
            chosen_a = np.argmax(policy[s])
            # Find the best action by one-step lookahead
            action_values = np.zeros(4)
            for a in range(4):
                for i in actions:
                    next_state, reward, done = P[s][i]
                    action_values[a] += 0.25 * (reward + 1 * V[next_state])
            
            best_a = np.argmax(action_values)
            
            # Greedily (max in the above line) update the policy
            if chosen_a != best_a:
                policy_stable = False
            policy[s] = np.eye(4)[best_a]
        
        # Until we've found an optimal policy. Return it
        if policy_stable:
            return policy, V

In [40]:
state_values = [-1 for i in range(GRID_SIZE*GRID_SIZE)]
state_values[0] = 0
state_values[(GRID_SIZE*GRID_SIZE) - 1] = 0
policy_imp, state_values_imp = policy_improvement(P,state_values, GRID_SIZE)

print(policy_imp, state_values_imp)

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]] [0.0, -13.999999353455285, -19.999999070505464, -21.999998976918956, -13.999999353455287, -17.9999992066048, -19.99999913855513, -19.999999148405234, -19.999999070505467, -19.99999913855513, -17.999999273098258, -13.99999945728622, -21.99999897691896, -19.999999148405237, -13.99999945728622, 0.0]


In [159]:
# implement the algorithm
def value_iteration(P, gamma = 2.0):
    actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
    # initialize value table with zeros
    state_len = len(P.keys())
    value_table = np.zeros(state_len)
    

    # set number of iterations and threshold
    no_of_iterations = 10
    threshold = 1e-20

    for i in range(no_of_iterations):
        updated_value_table = np.copy(value_table)

        # now we calculate Q value for each actions in the state
        # and update the value of state with maximum Q value
        for s in range(state_len):
            Q_value = []
            for action in range(len(actions)):
                next_states_rewards = []
                for a in actions:
                    next_state, reward_prob, done = P[s][a]
                    print("next_state -> ",next_state, " i->", i, " s->", s )
                    print("reward_prob -> ", reward_prob)
                    next_states_rewards.append((done * (reward_prob + gamma * updated_value_table[next_state])))

                Q_value.append(np.sum(next_states_rewards))

            value_table[s] = max(Q_value)
            print("value_table ->", value_table)

        # we will check whether we have reached the convergence i.e whether the difference
        # between our value table and updated value table is very small. But how do we know it is very
        # small? We set some threshold and then we will see if the difference between value table and
        # updated value table is less than our threshold. If yes we break the loop else we continue.
        if (np.sum(np.fabs(updated_value_table - value_table)) <= threshold):
            print ('Value-iteration converged at iteration# %d.' %(i+1))
            break

    return value_table



In [160]:
optimal_value_function = value_iteration(P, gamma=1.0)

next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
next_state ->  0  i-> 0  s-> 0
reward_prob ->  0.0
value_table -> [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
next_state ->  1  i-> 0  s-> 1
reward_prob ->  -1.0
next_state ->  2  i-> 0  s-> 1
reward_prob ->  -1.0
next_state ->  

In [156]:
optimal_value_function

array([ 0., -1.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,
        0., -1.,  0.])

In [89]:
# code policy value iteration algorithm for reinforcement learning in python from scratch

import numpy as np
import gym
import time

np.random.seed(123)

# create the environment
env = gym.make('FrozenLake-v1')

# implement the algorithm
def value_iteration(env, gamma = 1.0):
    # initialize value table with zeros
    value_table = np.zeros(env.observation_space.n)

    # set number of iterations and threshold
    no_of_iterations = 100000
    threshold = 1e-20

    for i in range(no_of_iterations):
        updated_value_table = np.copy(value_table)

        # now we calculate Q value for each actions in the state
        # and update the value of state with maximum Q value
        for state in range(env.observation_space.n):
            Q_value = []
            for action in range(env.action_space.n):
                next_states_rewards = []
                for next_sr in env.P[state][action]:
                    trans_prob, next_state, reward_prob, _ = next_sr
                    next_states_rewards.append((trans_prob * (reward_prob + gamma * updated_value_table[next_state])))

                Q_value.append(np.sum(next_states_rewards))

            value_table[state] = max(Q_value)

        # we will check whether we have reached the convergence i.e whether the difference
        # between our value table and updated value table is very small. But how do we know it is very
        # small? We set some threshold and then we will see if the difference between value table and
        # updated value table is less than our threshold. If yes we break the loop else we continue.
        if (np.sum(np.fabs(updated_value_table - value_table)) <= threshold):
            print ('Value-iteration converged at iteration# %d.' %(i+1))
            break

    return value_table

def extract_policy(value_table, gamma = 1.0):
    # initialize the policy with zeros
    policy = np.zeros(env.observation_space.n)

    for state in range(env.observation_space.n):
        # initialize the Q table for a state
        Q_table = np.zeros(env.action_space.n)

        # compute Q value for all ations in the state
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]:
                trans_prob, next_state, reward_prob, _ = next_sr
                Q_table[action] += (trans_prob * (reward_prob + gamma * value_table[next_state]))

        # select the action which has maximum Q value as an optimal action of the state
        policy[state] = np.argmax(Q_table)

    return policy

def main():
    optimal_value_function = value_iteration(env=env, gamma=1.0)
    print(optimal_value_function)
    optimal_policy = extract_policy(optimal_value_function, gamma=1.0)
    print(optimal_policy)

    # play the game with optimal policy
    state = env.reset()
    env.render()
    while True:
        action = int(optimal_policy[state])
        state, reward, done, info = env.step(action)
        env.render()
        time.sleep(0.5)
        if done:
            break

if __name__ == '__main__':
    main()

Value-iteration converged at iteration# 1373.
[0.82352941 0.82352941 0.82352941 0.82352941 0.82352941 0.
 0.52941176 0.         0.82352941 0.82352941 0.76470588 0.
 0.         0.88235294 0.94117647 0.        ]
[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [406]:
def policy_evaluation_1(policy, P, discount=1.0, theta=1e-6, max_iterations=1000):
    
    state_len = len(P.keys())
    actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
    action_len = len(actions)
    states = np.arange(GRID_SIZE*GRID_SIZE)

    # Number of evaluation iterations
    evaluation_iterations = 1
    # Initialize a value function for each state as zero
    V = np.zeros(state_len)
    # Repeat until change in value is below the threshold
    for i in range(int(max_iterations)):
        # Initialize a change of value function as zero
        delta = 0
        # Iterate though each state
        for state in range(state_len):
            # Initial a new value of current state
            v = 0
            # Try all possible actions which can be taken from this state
            for action_state, action in enumerate(policy[state]):
                action_probability = policy[state][action]
                # Check how good next state will be
                #print(P[state][action])
                #for next_state, reward, terminated in P[state][action]:
                    # Calculate the expected value
                for action in actions:
                    next_state, reward, done = P[state][action]
                    v += uniform_policy[s][action] * action_probability * (reward + discount * V[next_state])

            # Calculate the absolute change of value function
            delta = max(delta, np.abs(V[state] - v))
            # Update value function
            V[state] = v
            
        evaluation_iterations += 1

        # Terminate if value change is insignificant
        if delta < theta:
            print(f'Policy evaluated in {evaluation_iterations} iterations.')
            return V

In [407]:
VV = policy_evaluation_1(uniform_policy, P)

Policy evaluated in 168 iterations.


In [408]:
VV

array([  0.        , -13.99999335, -19.99999044, -21.99998948,
       -13.99999335, -17.99999184, -19.99999114, -19.99999125,
       -19.99999044, -19.99999114, -17.99999253, -13.99999442,
       -21.99998948, -19.99999125, -13.99999442,   0.        ])

In [409]:
def one_step_lookahead(P, state, V, discount_factor):
        action_values = np.zeros(4)
        probability = 0.25
        actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
        uniform_policy = {s : { a : 1/len(actions) for a in actions } for s in states}
        
        for s in range(len(P.keys())):
            # we will get new state value
            new_s = 0
            # for all actions
            for a in range(len(actions)):
                # for all transitions from currect state
                for action in actions:
                    #print("i->", i)
                    #print("uniform_policy ->", uniform_policy)
                    next_state, reward, done = P[s][action]
                    print(action_values[a])
                    action_values[a] += uniform_policy[s][action] * (reward + discount_factor * V[next_state])
        return action_values

In [410]:
def policy_iteration(P, discount_factor=1.0, max_iterations=1e9):
        
        state_len = len(P.keys())
        action_len = 4 # up right down left
        states = np.arange(GRID_SIZE*GRID_SIZE)
        actions = ['UP', 'RIGHT', 'DOWN', 'LEFT']
        transition_prob = 1/4
        
        # Start with a random policy
        #num states x num actions / num actions
        #policy = np.ones([state_len, action_len]) / action_len
        policy = {s : { a : 1/len(actions) for a in actions } for s in states}
        #print(policy)
        # Initialize counter of evaluated policies
        evaluated_policies = 1
        # Repeat until convergence or critical number of iterations reached
        for i in range(int(max_iterations)):
                stable_policy = True
                # Evaluate current policy
                #V = policy_evaluation(policy, environment, discount_factor=discount_factor)
                V = policy_evaluation(P, state_values, GRID_SIZE)
                # Go through each state and try to improve actions that were taken (policy Improvement)
                for state in range(state_len):
                        # Choose the best action in a current state under current policy
                        current_action = np.argmax(policy[state])
                        # Look one step ahead and evaluate if current action is optimal
                        # We will try every possible action in a current state
                        action_value = one_step_lookahead(P, state, V, discount_factor)
                        # Select a better action
                        best_action = np.argmax(action_value)
                        # If action didn't change
                        if current_action != best_action:
                                stable_policy = True
                                # Greedy policy update
                                policy[state] = np.eye(action_len)[best_action]
                evaluated_policies += 1
                # If the algorithm converged and policy is not changing anymore, then return final policy and value function
                if stable_policy:
                        print(f'Evaluated {evaluated_policies} policies.')
                        return policy, V

In [414]:
policcc, vvvv = policy_iteration(P)

TypeError: 'float' object is not subscriptable