3-5 FrozenLake에서 가치 반복 (스토캐스틱 환경)

In [3]:
import gymnasium as gym
import numpy as np

In [None]:
def _action_values(transitions, V, state, n_actions, gamma):
    """Return the expected one-step return of every action taken in ``state``."""
    q = np.zeros(n_actions)
    for action in range(n_actions):
        # Each entry is (probability, next_state, reward, done). An action may
        # have several possible outcomes, so the Bellman backup is the
        # probability-weighted sum over all of them, not just the first one.
        q[action] = sum(
            prob * (reward + gamma * V[next_state])
            for prob, next_state, reward, _done in transitions[state][action]
        )
    return q


def value_iteration(env, gamma=0.9, theta=1e-8):
    """Compute state values and a greedy policy by value iteration.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``P`` where ``P[state][action]`` is a list
            of ``(probability, next_state, reward, done)`` tuples. ``P`` is
            looked up on ``env.unwrapped`` when that attribute exists, because
            the table lives on the base environment rather than on wrappers.
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence tolerance; sweeps stop once the largest value
            change in a full sweep falls below it.

    Returns:
        A tuple ``(V, pi)``: ``V`` is a float array of state values and ``pi``
        is an int array holding the greedy action index for each state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = getattr(env, "unwrapped", env).P

    V = np.zeros(n_states)

    # Value iteration: in-place sweeps, so later states in a sweep already
    # see the values updated earlier in the same sweep.
    while True:
        delta = 0.0  # largest value change seen in this sweep
        for state in range(n_states):
            best = np.max(_action_values(transitions, V, state, n_actions, gamma))
            delta = max(delta, abs(V[state] - best))
            V[state] = best
        if delta < theta:
            break

    # Policy extraction: act greedily with respect to the converged values.
    # np.argmax breaks ties in favour of the lowest action index.
    pi = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        pi[state] = np.argmax(_action_values(transitions, V, state, n_actions, gamma))
    return V, pi

# Build the 4x4 FrozenLake environment once. is_slippery=False selects the
# deterministic variant of the environment; pass is_slippery=True to run
# value iteration on the stochastic one instead.
env = gym.make('FrozenLake-v1', render_mode='ansi', is_slippery=False)
V, pi = value_iteration(env)
# Reshape the flat per-state arrays to the 4x4 grid layout for display.
print('최적 정책\n', pi.reshape(4,4))
print('최적 가치함수\n', np.round(V.reshape(4,4),4))


최적 정책
 [[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]
최적 가치함수
 [[0.5905 0.6561 0.729  0.6561]
 [0.6561 0.     0.81   0.    ]
 [0.729  0.81   0.9    0.    ]
 [0.     0.9    1.     0.    ]]
