In [1]:
import numpy as np

# Grid-world layout: a board with 4 rows and 5 columns.
GRID_ROWS = 4
GRID_COLS = 5

# A move is identified by its index into `actions`; `action_symbols` holds
# the one-letter label that is printed for the move at the same index.
actions = ['up', 'down', 'left', 'right']
action_symbols = [name[0].upper() for name in actions]  # For display purposes
action_space_size = len(actions)

In [2]:
# Actor: one action preference per (row, col, action), initialised with
# uniform random values drawn by np.random.rand.
theta = np.random.rand(GRID_ROWS, GRID_COLS, action_space_size)

# Critic: one state-value estimate per grid cell, starting at zero.
value_function = np.zeros(shape=(GRID_ROWS, GRID_COLS))

# Hyperparameters
alpha_theta, alpha_v = 0.1, 0.1  # Actor (policy) / critic (value) learning rates
gamma = 0.9                      # Discount factor for future rewards

# Reward grid: same shape as the value table, zero everywhere except the goal.
rewards = np.zeros_like(value_function)
rewards[3, 4] = 1  # Goal position

# Function to print reward table
def print_reward_table():
    """Print the module-level ``rewards`` grid under a heading, then blank lines."""
    # A single print call with newline separators emits the same text as
    # printing the heading, the grid and "\n" one after another.
    print("Reward Table:", rewards, "\n", sep="\n")
print_reward_table()

Reward Table:
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]]




In [3]:
# Policy softmax function to get action probabilities
def softmax(x):
    """Return the softmax of ``x`` taken along axis 0.

    The maximum along axis 0 is subtracted before exponentiating. This does
    not change the mathematical result, but it keeps ``np.exp`` from
    overflowing to ``inf`` (which would turn the quotient into ``nan``) when
    the inputs are large.

    Parameters
    ----------
    x : array_like
        Scores to normalise; for a 1-D input this is one score per action.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries are non-negative and
        sum to 1 along axis 0. An empty input yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on a zero-length axis.
        return np.exp(x)

    shifted = x - np.max(x, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)  # computed once and reused below
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

# Function to choose action based on policy (probabilities from softmax)
def choose_action(state):
    """Sample an action index for ``state`` from the current policy.

    ``state`` is a ``(row, col)`` pair. The preferences stored in ``theta``
    for that cell are converted to probabilities with ``softmax`` and a single
    index in ``range(len(actions))`` is drawn with those probabilities.
    """
    r, c = state
    n_actions = len(actions)
    return np.random.choice(n_actions, p=softmax(theta[r, c]))

# Function to take a step in the grid world
def take_step(state, action):
    """Move one cell from ``state`` in the direction encoded by ``action``.

    Action codes: 0 = up, 1 = down, 2 = left; any other value is treated as
    right. A move that would cross the edge it is heading towards is clamped
    to that edge.

    Returns
    -------
    tuple
        ``(new_state, reward)`` where ``new_state`` is a ``(row, col)`` tuple
        and ``reward`` is the entry of ``rewards`` at ``new_state``.
    """
    row, col = state

    # Only one coordinate changes per move; adjust it in place.
    if action == 0:  # Up
        row = max(0, row - 1)
    elif action == 1:  # Down
        row = min(GRID_ROWS - 1, row + 1)
    elif action == 2:  # Left
        col = max(0, col - 1)
    else:  # Right
        col = min(GRID_COLS - 1, col + 1)

    new_state = (row, col)
    return new_state, rewards[new_state]

In [4]:
# Actor-Critic Update
def actor_critic_update(state, action, reward, next_state):
    """Apply one actor-critic update for the transition just observed.

    The critic entry ``value_function[state]`` and the actor preferences
    ``theta[state]`` are modified in place; nothing is returned.

    Parameters
    ----------
    state, next_state : (row, col) pairs
        Cell the agent was in and the cell it moved to.
    action : int
        Index of the action that was taken in ``state``.
    reward : float
        Reward received for the transition.
    """
    r, c = state
    nr, nc = next_state

    # Critic: one-step TD error, then nudge V(state) towards the TD target
    delta = (reward + gamma * value_function[nr, nc]) - value_function[r, c]
    value_function[r, c] += alpha_v * delta

    # Actor: for a softmax policy, d log pi(action) / d theta[a] equals
    # 1{a == action} - pi(a); all preferences of the cell are updated at once
    probs = softmax(theta[r, c])
    taken = (np.arange(action_space_size) == action).astype(float)
    theta[r, c] += alpha_theta * delta * (taken - probs)

# Function to display policy and probabilities side-by-side
def display_policy():
    """Print the greedy action and its probability for every grid cell.

    Each printed line shows one grid row: on the left the symbol of the most
    probable action per cell, on the right that action's probability rounded
    to two decimals.
    """
    symbol_rows = []
    prob_rows = []

    # Collect everything first so that printing happens in one go afterwards
    for r in range(GRID_ROWS):
        cell_probs = [softmax(theta[r, c]) for c in range(GRID_COLS)]
        symbol_rows.append([action_symbols[np.argmax(p)] for p in cell_probs])
        prob_rows.append([np.max(p) for p in cell_probs])

    # Display both grids side by side
    print("Policy (Direction) | Probabilities (Max)")
    for symbols, probs in zip(symbol_rows, prob_rows):
        left = " ".join(symbols)
        right = " ".join(f"{p:.2f}" for p in probs)
        print(f"{left}      | {right}")

In [5]:
# Actor-Critic algorithm implementation
def actor_critic(grid_iterations=100, start_state=(0, 0), goal_state=(3, 4)):
    """Train the agent for a number of episodes, then print a sample path.

    Every episode starts at ``start_state`` and runs until ``goal_state`` is
    reached. After each step ``actor_critic_update`` adjusts the module-level
    ``theta`` and ``value_function`` in place, so repeated calls continue
    training from where the previous call stopped.

    Parameters
    ----------
    grid_iterations : int
        Number of training episodes to run.
    start_state : (row, col) pair
        Cell in which every episode (and the final rollout) begins.
    goal_state : (row, col) pair
        Cell that ends an episode. This only controls when an episode stops;
        the reward itself is whatever ``take_step`` reads from ``rewards``.

    Notes
    -----
    Episodes have no step limit: each one runs until the sampled actions
    reach ``goal_state``. The final path is sampled with ``choose_action`` as
    well, so it is one random rollout of the learned policy rather than the
    greedy path.
    """
    # Normalise to tuples: take_step returns tuples, and a tuple never
    # compares equal to a list, which would keep the loops below running.
    start_state = tuple(start_state)
    goal_state = tuple(goal_state)

    for iteration in range(grid_iterations):
        state = start_state

        while state != goal_state:  # Run until we reach the goal
            action = choose_action(state)
            next_state, reward = take_step(state, action)

            # Update Actor and Critic based on the step
            actor_critic_update(state, action, reward, next_state)

            state = next_state

        # Every 10 iterations, print the policy and probabilities
        if (iteration + 1) % 10 == 0:
            print(f"Iteration {iteration + 1}:")
            display_policy()
            print("\n")

    # Final grid and path taken after training
    print("Final policy parameters (theta):")
    display_policy()

    # Show one sampled path from start to goal (no learning updates here)
    state = start_state
    path_taken = [state]
    while state != goal_state:
        action = choose_action(state)
        state, _ = take_step(state, action)
        path_taken.append(state)

    print("Path taken:", path_taken)

# Run the Actor-Critic algorithm four times in a row. theta and
# value_function are updated in place, so each run continues the training
# of the previous one.
for _run in range(4):
    actor_critic(100)  # Run for 100 iterations

Iteration 10:
Policy (Direction) | Probabilities (Max)
L R U D D      | 0.33 0.29 0.32 0.42 0.30
U D L D R      | 0.38 0.37 0.36 0.34 0.33
L D U R D      | 0.34 0.35 0.33 0.33 0.36
U R L D L      | 0.29 0.37 0.31 0.30 0.39


Iteration 20:
Policy (Direction) | Probabilities (Max)
L R U D D      | 0.33 0.29 0.32 0.42 0.30
U D L D R      | 0.38 0.37 0.35 0.35 0.32
L D U R D      | 0.34 0.36 0.33 0.32 0.39
U R R R L      | 0.29 0.38 0.32 0.36 0.39


Iteration 30:
Policy (Direction) | Probabilities (Max)
L R U D D      | 0.32 0.29 0.32 0.42 0.30
U D L D R      | 0.38 0.38 0.34 0.38 0.32
L D U R D      | 0.33 0.37 0.31 0.34 0.44
U R R R L      | 0.28 0.41 0.39 0.43 0.39


Iteration 40:
Policy (Direction) | Probabilities (Max)
L R U D D      | 0.32 0.29 0.32 0.43 0.31
U D L D R      | 0.38 0.38 0.34 0.40 0.31
L D U R D      | 0.33 0.37 0.31 0.34 0.51
U R R R L      | 0.28 0.43 0.43 0.48 0.39


Iteration 50:
Policy (Direction) | Probabilities (Max)
L R U D D      | 0.32 0.29 0.31 0.44 0.32
U D