3-5 FrozenLake에서 가치 반복 (스토캐스틱 환경, 전이확률 수정)

In [1]:
import gymnasium as gym
import numpy as np

In [2]:
def _action_values(transitions, state, n_actions, gamma, V):
    """Return the one-step lookahead Q-value of every action in ``state``.

    ``transitions[state][action]`` is iterated as
    ``(probability, next_state, reward, done)`` tuples; the ``done`` flag is
    not used in the backup.
    """
    q = np.zeros(n_actions)
    for action in range(n_actions):
        for state_prob, next_state, reward, _done in transitions[state][action]:
            # Expected immediate reward plus discounted value of the successor.
            q[action] += state_prob * (reward + gamma * V[next_state])
    return q


def value_iteration(env, gamma=0.9, theta=1e-8):
    """Run value iteration on a tabular environment and extract a greedy policy.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``P`` where ``P[state][action]`` is a list
            of ``(probability, next_state, reward, done)`` tuples. ``P`` is
            looked up on ``env.unwrapped`` when that attribute exists.
        gamma: Discount factor applied to successor state values.
        theta: Convergence threshold; sweeps stop once the largest value
            change within a sweep falls below it.

    Returns:
        A tuple ``(V, pi)``: ``V`` is a float array with one value per state,
        ``pi`` is an int array holding the greedy action index per state
        (ties resolve to the lowest action index via ``np.argmax``).
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    # Gymnasium wrappers (as returned by gym.make) do not forward arbitrary
    # attributes such as ``P`` in v1.0+, so read the model from the base env.
    # Plain objects without ``unwrapped`` are used as-is.
    transitions = getattr(env, "unwrapped", env).P

    V = np.zeros(n_states)

    # Sweep all states until the value function stops changing. V is updated
    # in place, so later states in a sweep already see the new values.
    while True:
        delta = 0.0
        for state in range(n_states):
            best = np.max(_action_values(transitions, state, n_actions, gamma, V))
            delta = max(delta, abs(V[state] - best))
            V[state] = best
        if delta < theta:
            break

    # Policy extraction: act greedily with respect to the converged values.
    pi = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        pi[state] = np.argmax(_action_values(transitions, state, n_actions, gamma, V))
    return V, pi

# Create the slippery FrozenLake environment whose tabular model is edited below.
env = gym.make('FrozenLake-v1', render_mode='ansi', is_slippery=True)

# gym.make returns a wrapped environment; Gymnasium v1.0+ wrappers do not
# forward ``P``, so the transition table is taken from the base environment.
transition_model = env.unwrapped.P

# Overwrite the probabilities of every three-outcome transition list, by
# position in the list, keeping next state, reward and done flag unchanged.
prob = [0.1, 0.8, 0.1]
for state in range(env.observation_space.n):
    for action in range(env.action_space.n):
        outcomes = transition_model[state][action]
        if len(outcomes) == 3:
            for i, (_, next_state, reward, done) in enumerate(outcomes):
                outcomes[i] = (prob[i], next_state, reward, done)

# Solve the modified MDP; reshape(4, 4) assumes 16 states laid out as a grid.
V, pi = value_iteration(env)
print('최적 정책\n', pi.astype(int).reshape(4,4))
print('최적 가치함수\n', np.round(V.reshape(4,4),4))


최적 정책
 [[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]
최적 가치함수
 [[0.3804 0.3589 0.4536 0.3589]
 [0.436  0.     0.5403 0.    ]
 [0.551  0.7108 0.7504 0.    ]
 [0.     0.8246 0.9533 0.    ]]
