In [1]:
import numpy as np

# Grid world dimensions
grid_height = 4
grid_width = 5
# Possible actions (Up, Down, Left, Right)
actions = ['U', 'D', 'L', 'R']
num_actions = len(actions)

In [2]:
# Initialize state-value function (Critic) and policy (Actor)
value_table = np.zeros((grid_height, grid_width))
policy_table = np.full((grid_height, grid_width), 1 / num_actions)

# Rewards
rewards = np.zeros((grid_height, grid_width))
rewards[3, 4] = 1  # Goal state reward
rewards[2, 2] = -1  # Pitfall state penalty

# Learning parameters
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
theta = 0.1  # small value for convergence check


In [3]:
# Define state transition function
def step(state, action):
    x, y = state
    if action == 'U':
        x = max(0, x - 1)
    elif action == 'D':
        x = min(grid_height - 1, x + 1)
    elif action == 'L':
        y = max(0, y - 1)
    elif action == 'R':
        y = min(grid_width - 1, y + 1)
    return (x, y)

# Select action using policy (Actor)
def select_action(state):
    policy = policy_table[state[0], state[1]]
    return np.random.choice(actions, p=[policy] * num_actions)

In [4]:
# Main Actor-Critic algorithm
def actor_critic(num_iterations=1000):
    for iteration in range(num_iterations):
        state = (0, 0)  # Start at top-left corner
        path_taken = [state]

        for step_count in range(100):  # Max steps per episode
            # Select an action from the policy
            action = select_action(state)
            next_state = step(state, action)
            
            # Get the reward for the action taken
            reward = rewards[next_state]

            # Update Critic: Value function using Bellman equation
            td_error = reward + gamma * value_table[next_state[0], next_state[1]] - value_table[state[0], state[1]]
            value_table[state[0], state[1]] += alpha * td_error

            # Update Actor: Policy function using policy gradient
            policy_table[state[0], state[1]] += alpha * td_error

            # Move to the next state
            state = next_state
            path_taken.append(state)

            if iteration % 10 == 0:  # Print every 10 iterations
                print(f"Iteration {iteration}, Step {step_count}:")
                print(value_table)
                print("\n")

            # Check if reached terminal state (goal or pitfall)
            if state == (3, 4) or state == (3, 0):
                break

        # Convergence check (optional)
        if np.max(np.abs(value_table - np.copy(value_table))) < theta:
            break

    return value_table, path_taken

# Running the Actor-Critic algorithm
final_value_table, path = actor_critic()

# Print the final value table and path
print("Final Value Table:")
print(final_value_table)

print("Path Taken:")
print(path)

Iteration 0, Step 0:
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


Iteration 0, Step 1:
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


Iteration 0, Step 2:
[[ 0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0. ]
 [-0.1  0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0. ]]


Final Value Table:
[[ 0.   0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0. ]
 [-0.1  0.   0.   0.   0. ]
 [ 0.   0.   0.   0.   0. ]]
Path Taken:
[(0, 0), (1, 0), (2, 0), (3, 0)]
