In [1]:
import numpy as np
from tabulate import tabulate

In [6]:
transition_probabilities = np.array([
    [0,0.8,0,0,0.2,0,0,0,0,0,0,0,0,0,0,0],
    [0.1,0,0.8,0,0,0.1,0,0,0,0,0,0,0,0,0,0],
    [0,0.1,0,0.8,0,0,0.1,0,0,0,0,0,0,0,0,0],
    [0,0,0.2,0,0,0,0,0.8,0,0,0,0,0,0,0,0],
    [0.1,0,0,0,0,0.8,0,0,0.1,0,0,0,0,0,0,0],
    [0,0.1,0,0,0,0,0.8,0,0,0.1,0,0,0,0,0,0],
    [0,0,0.1,0,0,0,0,0.8,0,0,0.1,0,0,0,0,0],
    [0,0,0,0.1,0,0,0.1,0,0,0,0,0.8,0,0,0,0],
    [0,0,0,0,0.1,0,0,0,0,0.8,0,0,0.1,0,0,0],
    [0,0,0,0,0,0.1,0,0,0,0,0.8,0,0,0.1,0,0],
    [0,0,0,0,0,0,0.1,0,0,0,0,0.8,0,0,0.1,0],
    [0,0,0,0,0,0,0,0.1,0,0,0.1,0,0,0,0,0.8],
    [0,0,0,0,0,0,0,0,0.2,0,0,0,0,0.8,0,0],
    [0,0,0,0,0,0,0,0,0,0.1,0,0,0.1,0,0.8,0],
    [0,0,0,0,0,0,0,0,0,0,0.1,0,0,0.1,0,0.8],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]])

print(transition_probabilities.shape)
print(tabulate(transition_probabilities.tolist(), tablefmt='grid'))

(16, 16)
+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 0   | 0.8 | 0   | 0   | 0.2 | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   |
+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 0.1 | 0   | 0.8 | 0   | 0   | 0.1 | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   |
+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 0   | 0.1 | 0   | 0.8 | 0   | 0   | 0.1 | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   |
+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 0   | 0   | 0.2 | 0   | 0   | 0   | 0   | 0.8 | 0   | 0   | 0   | 0   | 0   | 0   | 0   | 0   |
+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 0.1 | 0   | 0   | 0   | 0   | 0.8 | 0   | 0   | 0.1 | 0   | 0   | 0   | 0   | 0   | 0   | 0   |
+-----+----

In [8]:
actionsMap = { action: i for i, action in enumerate(['up', 'down', 'left', 'right']) }
reverseActionsMap = { v: k for k, v in actionsMap.items() }
actionsMap, reverseActionsMap

({'up': 0, 'down': 1, 'left': 2, 'right': 3},
 {0: 'up', 1: 'down', 2: 'left', 3: 'right'})

In [54]:
def getValidActions(pos):
    row, col = pos

    actions = []
    if row > 0:
        actions.append(actionsMap['up'])
    if row < 3:
        actions.append(actionsMap['down'])
    if col > 0:
        actions.append(actionsMap['left'])
    if col < 3:
        actions.append(actionsMap['right'])
    return actions

def move(currentState, action):
    row, col = currentState
    if action == actionsMap['up']:
        row -= 1
    elif action == actionsMap['down']:
        row += 1
    elif action == actionsMap['left']:
        col -= 1
    elif action == actionsMap['right']:
        col += 1
    return (row, col)

In [25]:
actionsTable = np.array([
    [np.random.choice(getValidActions((i, j))) for j in range(4)] for i in range(4)
])

In [90]:
stateValues = np.random.uniform(1.0, 5.0, (4, 4))
stateValues

array([[2.2085902 , 4.84800419, 1.34356131, 4.41261137],
       [3.60804157, 1.40136552, 4.199238  , 2.24083579],
       [2.73952693, 2.75634132, 2.18349419, 1.73588675],
       [4.71738622, 3.10766542, 3.84839756, 4.13774734]])

In [91]:


class Environment:

    def __init__(self):
        self.statesMap = {
            'valid': 0,
            'dead': -1,
            'goal': 1
        }
        self.actionsMap = actionsMap
        self.reverseActionsMap = reverseActionsMap
        self.state = None
        self.currentState = None
        self.qtable = {}

        self.initialize()

    def initialize(self):
        self.state = np.zeros((4, 4), dtype=int)
        self.state[1][1] = self.statesMap['dead']
        self.state[3][3] = self.statesMap['goal']
        self.currentState = (0, 0)

    def reset(self):
        self.currentState = (0, 0)
        return self.currentState

    def displayState(self):
        copy = self.state.copy().tolist()
        row, col = self.currentState
        copy[row][col] = 'x'
        print(tabulate(copy, tablefmt='grid'))

    def getValidActions(self, state=None, pos=None):
        if state is None:
            row, col = pos if pos else self.currentState
        else:
            row, col = state

        actions = []
        if row > 0:
            actions.append(self.actionsMap['up'])
        if row < 3:
            actions.append(self.actionsMap['down'])
        if col > 0:
            actions.append(self.actionsMap['left'])
        if col < 3:
            actions.append(self.actionsMap['right'])
        return actions

    def move(self, action):
        row, col = self.currentState
        if action == self.actionsMap['up']:
            row -= 1
        elif action == self.actionsMap['down']:
            row += 1
        elif action == self.actionsMap['left']:
            col -= 1
        elif action == self.actionsMap['right']:
            col += 1
        return (row, col)

    def step(self, action):
        next_state = self.move(action)
        self.currentState = next_state

        # Handle reward
        if next_state == (1, 1):  # dead
            return next_state, -1, True
        elif next_state == (3, 3):  # goal
            return next_state, 10, True
        else:
            return next_state, -0.1, False
        
        
# env = Environment()
# alpha = 0.1
# gamma = 0.9
# epsilon = 0.2
# episodes = 1000

# for ep in range(episodes):
#     state = env.reset()
#     done = False
#     R = 0

#     while not done:
#         if state not in env.qtable:
#             env.qtable[state] = { a: 0.0 for a in env.actionsMap.values() }

#         # ε-greedy action selection
#         if np.random.rand() < epsilon:
#             action = np.random.choice(env.getValidActions(state))
#         else:
#             valid = { k: v for k, v in env.qtable[state].items() if k in env.getValidActions(state) }
#             action = max(valid, key=valid.get)

#         next_state, reward, done = env.step(action)
#         R += reward

#         if next_state not in env.qtable:
#             env.qtable[next_state] = { a: 0.0 for a in env.actionsMap.values() }

#         # Q-learning update
#         print(state, env.getValidActions(state))
#         env.qtable[state][action] = env.qtable[state][action] + alpha * (
#             reward + gamma * max(env.qtable[next_state].values()) - env.qtable[state][action]
#         )
#         state = next_state
    
#     print(f"episode: {ep}, reward: {R}")
#     R = 0


# grid = [['' for _ in range(4)] for _ in range(4)]

# for r in range(4):
#     for c in range(4):
#         state = (r, c)
#         if state == (1,1):
#             grid[r][c] = 'x'
#         elif state == (3,3):
#             grid[r][c] = 'G'
#         elif state in env.qtable:
#             best_action = max(env.qtable[state], key=env.qtable[state].get)
#             grid[r][c] = env.reverseActionsMap[best_action][0].upper()
#         else:
#             grid[r][c] = '?'

# print("\nLearned Policy:")
# print(tabulate(grid, tablefmt='grid'))

# print(tabulate([k for k in env.qtable.values()]))

In [None]:
def policyEvaluation(env: Environment):
    delta = float('inf')
    while delta > 1:
        delta = 0

        # for all states
        for i in range(4):
            for j in range(4):
                env.currentState = (i, j)
                v = stateValues[i][j]
                action = actionsTable[i][j]
                next_state, reward, done = env.step(action)

                print(f"current step: ({i}, {j}) | valid action: {reverseActionsMap[action]} | next step: ({next_state[0], next_state[1]})")
                probability = transition_probabilities[(i * 4) + j][(next_state[0] * 4) + next_state[1]]
                stateValues[i][j] = probability * (reward + v)
                print(f"prob: {probability}, reward: {reward}, v: {v}")
                delta = max(delta, np.abs(v - stateValues[i][j]))

        print(delta)
    print("finished policy evaluation", delta)

In [93]:
env = Environment()
policyEvaluation(env)

current step: (0, 0) | valid action: right | next step: ((0, 1))
prob: 0.8, reward: -0.1, v: 2.208590201798891
current step: (0, 1) | valid action: left | next step: ((0, 0))
prob: 0.1, reward: -0.1, v: 4.848004189735375
current step: (0, 2) | valid action: left | next step: ((0, 1))
prob: 0.1, reward: -0.1, v: 1.343561312563729
current step: (0, 3) | valid action: left | next step: ((0, 2))
prob: 0.2, reward: -0.1, v: 4.4126113674113405
current step: (1, 0) | valid action: up | next step: ((0, 0))
prob: 0.1, reward: -0.1, v: 3.6080415659481995
current step: (1, 1) | valid action: down | next step: ((2, 1))
prob: 0.1, reward: -0.1, v: 1.4013655157282083
current step: (1, 2) | valid action: down | next step: ((2, 2))
prob: 0.1, reward: -0.1, v: 4.199238002625241
current step: (1, 3) | valid action: up | next step: ((0, 3))
prob: 0.1, reward: -0.1, v: 2.2408357940269674
current step: (2, 0) | valid action: right | next step: ((2, 1))
prob: 0.8, reward: -0.1, v: 2.7395269324841958
current

In [95]:
print(tabulate(stateValues, tablefmt='grid'))

+------------+------------+------------+------------+
| -0.175924  | -0.0111111 | -0.0111111 | -0.0249999 |
+------------+------------+------------+------------+
| -0.0111111 | -0.0111111 | -0.0111111 | -0.0111111 |
+------------+------------+------------+------------+
| -0.130317  | -0.128872  | -0         | 36.7131    |
+------------+------------+------------+------------+
|  0.0395801 | -0.0111111 | -0.0111111 | -0         |
+------------+------------+------------+------------+
