In [24]:
import numpy as np
from collections import defaultdict

# ==========================================
# 1. GridWorld 환경 (step, reset 함수 추가됨)
# ==========================================
class gridworld:
    """A 3x4 grid environment with one wall cell, one goal cell and a fixed start.

    States are ``(row, col)`` tuples. Actions are the integers 0-3, meaning
    up, down, left and right (see ``action_meaning``). The reward for a
    transition is read from ``reward_map`` at the cell the agent lands on.
    """

    def __init__(self):
        self.action_space = [0, 1, 2, 3]
        self.action_meaning = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}
        # The None entry sits at the same position as wall_state; the agent
        # can never land there, so its reward is never read.
        self.reward_map = np.array([[0, 0, 0, 1.0],
                                    [0, None, 0, -1.0],
                                    [0, 0, 0, 0]])
        self.goal_state = (0, 3)
        self.wall_state = (1, 1)
        self.start_state = (2, 0)
        self.agent_state = self.start_state

    def height(self):
        """Return the number of rows in the grid."""
        return len(self.reward_map)

    def width(self):
        """Return the number of columns in the grid."""
        return len(self.reward_map[0])

    def actions(self):
        """Return the list of available action ids."""
        return self.action_space

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.agent_state = self.start_state
        return self.agent_state

    def step(self, action):
        """Apply ``action`` from the agent's current cell.

        Returns:
            A ``(next_state, reward, done)`` tuple. ``done`` is true only
            when the agent lands on ``goal_state``.
        """
        current = self.agent_state
        landed = self.next_state(current, action)
        gained = self.reward(current, action, landed)
        finished = (landed == self.goal_state)

        self.agent_state = landed
        return landed, gained, finished

    def next_state(self, state, action):
        """Return the cell reached by taking ``action`` in ``state``.

        A move that would leave the grid or enter the wall leaves the agent
        where it is, so ``state`` itself is returned.
        """
        d_row, d_col = [(-1, 0), (1, 0), (0, -1), (0, 1)][action]
        candidate = (state[0] + d_row, state[1] + d_col)
        row, col = candidate

        inside = 0 <= col < self.width() and 0 <= row < self.height()
        if not inside:
            return state
        if candidate == self.wall_state:
            return state
        return candidate

    def reward(self, state, action, next_state):
        """Return the reward stored for the cell the agent lands on.

        ``state`` and ``action`` are accepted but not used.
        """
        row, col = next_state[0], next_state[1]
        return self.reward_map[row, col]

    def render_v(self, V):
        """Print the state values in ``V`` as a grid, one text line per row.

        ``V`` is indexed by ``(row, col)`` for every non-wall cell; the wall
        cell is drawn as ``###``. A blank line is printed after the grid.
        """
        for row in range(self.height()):
            for col in range(self.width()):
                cell = (row, col)
                if cell != self.wall_state:
                    print(f"{V[cell]:6.2f}", end=" ")
                else:
                    print("  ###  ", end="")
            print()
        print()

# ==========================================
# 2. TD Agent (들여쓰기 수정됨)
# ==========================================
class TDagent:
    """Tabular TD(0) value estimator that acts with a uniform random policy.

    Attributes:
        gamma: Discount factor applied to the next state's value.
        alpha: Step size of each value update.
        action_size: Number of discrete actions (ids ``0 .. action_size - 1``).
        pi: Maps a state to a ``{action: probability}`` dict.
        V: Maps a state to its current value estimate (0 until updated).
    """

    def __init__(self, gamma=0.9, alpha=0.01):
        """Create the agent.

        Args:
            gamma: Discount factor; defaults to 0.9.
            alpha: Learning rate; defaults to 0.01.
        """
        self.gamma = gamma
        self.alpha = alpha
        self.action_size = 4

        action_size = self.action_size
        uniform = 1.0 / action_size

        # Build a new dict for every state. Handing out one shared dict would
        # mean that changing the policy of a single state changes all states.
        self.pi = defaultdict(
            lambda: {action: uniform for action in range(action_size)}
        )
        self.V = defaultdict(float)

    def get_action(self, state):
        """Sample an action id for ``state`` according to ``self.pi[state]``."""
        action_probs = self.pi[state]
        actions = list(action_probs.keys())
        probs = list(action_probs.values())
        return np.random.choice(actions, p=probs)

    def eval(self, state, reward, next_state, done):
        """Apply one TD(0) update to ``V[state]``.

        Args:
            state: State the transition started from.
            reward: Reward received for the transition.
            next_state: State the transition ended in.
            done: Whether the episode ended; if so the next state's value is
                taken to be 0 and ``V[next_state]`` is not read.
        """
        next_V = 0 if done else self.V[next_state]
        target = reward + self.gamma * next_V
        self.V[state] += (target - self.V[state]) * self.alpha

# ==========================================
# 3. 실행부
# ==========================================
# Evaluate the random policy with TD(0): run a fixed number of episodes and
# update the value table after every single environment step.
env = gridworld()
agent = TDagent()

episodes = 1000
for episode in range(episodes):
    state = env.reset()
    done = False

    while not done:
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)

        agent.eval(state, reward, next_state, done)

        # Only advance when the episode continues, so `state` still holds
        # the last non-terminal state once the loop ends.
        if not done:
            state = next_state

print("=== TD Prediction Result ===")
env.render_v(agent.V)

=== TD Prediction Result ===
  0.01   0.07   0.19   0.00 
 -0.04   ###   -0.50  -0.24 
 -0.10  -0.23  -0.44  -0.77 

