벨만 방정식을 이용해 랜덤 정책을 평가 - FrozenLake 과업

In [6]:
import gymnasium as gym
import numpy as np

In [10]:
def policy_evaluation(env, policy, gamma=0.9, theta=1e-8):
    """Evaluate a fixed policy with iterative (in-place) Bellman backups.

    Args:
        env: Tabular environment exposing ``observation_space.n`` and a
            transition table ``P`` where ``P[state][action]`` is an iterable
            of ``(probability, next_state, reward, done)`` tuples. The table
            is read from ``env.unwrapped`` when that attribute exists, so
            both raw and wrapped environments are accepted.
        policy: Indexable by state; ``policy[state]`` yields the probability
            of each action, in action-index order.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold; sweeps stop once the largest
            per-state change in a sweep falls below this value.

    Returns:
        A 1-D NumPy array of length ``env.observation_space.n`` holding the
        estimated value of every state under ``policy``.
    """
    n_states = env.observation_space.n
    # gym.make() returns a wrapper stack. Gymnasium >= 1.0 wrappers no longer
    # forward arbitrary attributes, so read the table from the base env.
    transitions = getattr(env, "unwrapped", env).P

    V = np.zeros(n_states)  # Value estimates start at zero
    while True:
        delta = 0.0  # Largest change seen during this sweep
        for state in range(n_states):
            v = 0.0  # Expected return of `state` under the policy
            for action, action_prob in enumerate(policy[state]):
                for state_prob, next_state, reward, done in transitions[state][action]:
                    # A terminal transition ends the episode, so nothing is
                    # bootstrapped from the state it lands in.
                    future = 0.0 if done else V[next_state]
                    v += action_prob * state_prob * (reward + gamma * future)
            delta = max(delta, abs(v - V[state]))
            V[state] = v  # In-place update: later states see the new value
        # Stop when a full sweep changes no state by theta or more
        if delta < theta:
            break
    return V

# Build the deterministic (non-slippery) FrozenLake task.
env = gym.make('FrozenLake-v1', render_mode='ansi', is_slippery=False)

# Uniform random policy: every action is equally likely in every state.
n_states = env.observation_space.n
n_actions = env.action_space.n
pi = np.full([n_states, n_actions], 1.0 / n_actions)

# Evaluate the random policy and show the values as a 4x4 grid.
V = policy_evaluation(env, pi)
value_grid = V.reshape(4, 4).round(4)
print('랜덤정책의 가치함수\n', value_grid)


랜덤정책의 가치함수
 [[0.0045 0.0042 0.0101 0.0041]
 [0.0067 0.     0.0263 0.    ]
 [0.0187 0.0576 0.107  0.    ]
 [0.     0.1304 0.3915 0.    ]]
