In [1]:
import numpy as np

In [2]:
# Problem dimensions: six states (S0..S5), and one action per destination
# state, i.e. action j means "move to state j".
states = 6
actions = 6
gamma = 0.9  # Discount factor

# Initializing the Q-table with zeroes
Q = np.zeros((states, actions))
print("Initial Q-table", Q, "", sep="\n")

# Initializing the reward table: every (state, action) pair starts at the
# default reward of -1 and the specific transitions below are overwritten.
R = np.full((states, actions), -1)

# Reward for reaching the goal state (S5) is 100
for origin in (1, 4, 5):
    R[origin, 5] = 100

# Reward for a path that exists but does not reach the goal state is 0
for origin, target in [
    (0, 4),
    (1, 3),
    (2, 3),
    (3, 1), (3, 2), (3, 4),
    (4, 0), (4, 3),
    (5, 1), (5, 4),
]:
    R[origin, target] = 0

print("Reward Table", R, "", sep="\n")

# Fixed episodes to replay; each inner list is the sequence of states visited.
episodes_paths = [[1, 3, 4, 5], [0, 4, 5], [4, 0, 4, 5]]

# Function to update the Q-value


def update_q(state, action, reward, next_state, q_table=None, discount=None):
    """Overwrite one Q-table entry from a single observed transition.

    The entry is set to ``reward + discount * best_next``, where ``best_next``
    is the largest non-negative value in the ``next_state`` row of the table.
    The old value is replaced outright (there is no learning-rate blending),
    and the new value is printed.

    Parameters
    ----------
    state, action : int
        Row and column of the entry to update.
    reward : number
        Immediate reward for taking ``action`` in ``state``.
    next_state : int
        Row whose values supply the discounted future term.
    q_table : numpy.ndarray, optional
        Table to read and update in place. Defaults to the module-level ``Q``.
    discount : float, optional
        Discount factor. Defaults to the module-level ``gamma``.

    Returns
    -------
    None
        The table is modified in place.
    """
    if q_table is None:
        q_table = Q
    if discount is None:
        discount = gamma

    next_row = q_table[next_state, :]
    # Only non-negative estimates take part in the max. `initial=0.0` leaves
    # the result unchanged whenever at least one such entry exists, and makes
    # the future term 0 (instead of raising ValueError on an empty selection)
    # when every entry in the row is negative.
    max_next_q = np.max(next_row[next_row >= 0], initial=0.0)
    q_table[state, action] = reward + discount * max_next_q
    print(f"Q[{state}, {action}]: {q_table[state, action]}")


# Q Learning Algorithm
# Replay every recorded episode in order, updating the Q-table once per
# transition and printing the table after each episode.
for episode_number, path in enumerate(episodes_paths, start=1):
    print(f"Episode {episode_number} Path: {path}")

    # Pair each visited state with its successor along the path.
    for state, next_state in zip(path, path[1:]):
        # Actions are indexed by destination state, so the action taken is
        # simply the state that was moved to.
        action = next_state
        reward = R[state, action]
        print(f"State: {state}, Action: {action}, Reward: {reward}")
        update_q(state, action, reward, next_state)

    # Printing the Q-table
    print(f"Episode {episode_number} Q-table:")
    print(Q)
    print()

Initial Q-table
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]

Reward Table
[[ -1  -1  -1  -1   0  -1]
 [ -1  -1  -1   0  -1 100]
 [ -1  -1  -1   0  -1  -1]
 [ -1   0   0  -1   0  -1]
 [  0  -1  -1   0  -1 100]
 [ -1   0  -1  -1   0 100]]

Episode 1 Path: [1, 3, 4, 5]
State: 1, Action: 3, Reward: 0
Q[1, 3]: 0.0
State: 3, Action: 4, Reward: 0
Q[3, 4]: 0.0
State: 4, Action: 5, Reward: 100
Q[4, 5]: 100.0
Episode 1 Q-table:
[[  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0. 100.]
 [  0.   0.   0.   0.   0.   0.]]

Episode 2 Path: [0, 4, 5]
State: 0, Action: 4, Reward: 0
Q[0, 4]: 90.0
State: 4, Action: 5, Reward: 100
Q[4, 5]: 100.0
Episode 2 Q-table:
[[  0.   0.   0.   0.  90.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0