In [1]:
import gym
import numpy as np
env = gym.make('FrozenLake-v1')

def value_iteration(env, gamma = 1.0):
    value_table = np.zeros(env.nS)
    threshold = 0.000001
    i=0
    while True:
        i = i+1
        updated_value_table = np.copy(value_table)
        for state in range(env.nS):
            Q_value = []
            for action in range(env.nA):
                v = 0
                for trans_prob, next_state, reward, _ in env.P[state][action]:
                    v = v + trans_prob * (reward + gamma * updated_value_table[next_state])
                Q_value.append(v)
                
            value_table[state] = max(Q_value)
            
        if (np.max(np.fabs(updated_value_table - value_table)) <= threshold):
            print ('Value-iteration converged at iteration # %d.' %(i+1))
            break
    return value_table

def extract_policy(value_table, gamma = 1.0):
    policy = np.zeros(env.nS)
    for state in range(env.nS):
        Q_table = np.zeros(env.nA)
        for action in range(env.nA):
            for trans_prob, next_state, reward, _ in env.P[state][action]:
                Q_table[action] = Q_table[action] + (trans_prob * (reward + gamma * value_table[next_state]))
        policy[state]= np.argmax(Q_table)
    return policy

optimal_value_function = value_iteration(env=env, gamma = 1.0)
print(optimal_value_function)
optimal_policy = extract_policy(optimal_value_function, gamma=1.0)
print(optimal_policy)
optimal_policy_int = optimal_policy.astype(np.int)

done = False
nA = env.action_space.n
state = env.reset()
env.render()
while not done:
    action = optimal_policy_int[state]
    new_state, reward, done, info = env.step(action)
    env.render()
    state = new_state

Value-iteration converged at iteration # 432.
[0.82350607 0.82349825 0.82349269 0.82348981 0.82350777 0.
 0.52939508 0.         0.82351104 0.82351566 0.76469375 0.
 0.         0.88234317 0.9411714  0.        ]
[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHF