<a href="https://colab.research.google.com/github/krmonline/ReinforcementLearning/blob/main/bellman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

class Env:
    """Deterministic grid world with a single target cell.

    The agent starts in the top-left cell (0, 0) and the target sits in the
    bottom-right cell. States are numbered row-major, i.e.
    ``state = width * posY + posX``.

    Actions: 0 = left, 1 = right, 2 = up, 3 = down. A move that would leave
    the grid keeps the agent where it is.
    """

    def __init__(self, height=5, width=5):
        """Build a ``height`` x ``width`` grid (5x5 when called without arguments)."""
        self.height = height
        self.width = width
        self.posX = 0
        self.posY = 0
        self.endX = self.width - 1
        self.endY = self.height - 1
        self.actions = [0, 1, 2, 3]
        self.stateCount = self.height * self.width
        self.actionCount = len(self.actions)
        # Created here so `env.done` can be read before the first reset().
        self.done = False

    def reset(self):
        """Put the agent back on the start cell.

        Returns:
            The tuple ``(state, reward, done)``, always ``(0, 0, False)``.
        """
        self.posX = 0
        self.posY = 0
        self.done = False
        return 0, 0, False

    # take action
    def step(self, action):
        """Apply ``action`` and report the outcome.

        Args:
            action: 0 (left), 1 (right), 2 (up) or 3 (down). Any other value
                leaves the position unchanged.

        Returns:
            The tuple ``(nextState, reward, done)``; ``reward`` is 1 when the
            agent stands on the target after the move and 0 otherwise.
        """
        if action == 0:  # left
            self.posX = max(self.posX - 1, 0)
        elif action == 1:  # right
            self.posX = min(self.posX + 1, self.width - 1)
        elif action == 2:  # up
            self.posY = max(self.posY - 1, 0)
        elif action == 3:  # down
            self.posY = min(self.posY + 1, self.height - 1)

        # Keep the attribute in sync with the value handed back to the caller.
        self.done = self.posX == self.endX and self.posY == self.endY
        # map the (x, y) position to a number between 0 and stateCount - 1
        nextState = self.width * self.posY + self.posX
        reward = 1 if self.done else 0
        return nextState, reward, self.done

    # return a random action
    def randomAction(self):
        """Return one entry of ``self.actions`` picked by ``np.random.choice``."""
        return np.random.choice(self.actions)

    # display environment
    def render(self):
        """Print the grid: ``O`` = agent, ``T`` = target, ``.`` = any other cell."""
        for row in range(self.height):
            cells = []
            for col in range(self.width):
                if self.posY == row and self.posX == col:
                    cells.append("O")
                elif self.endY == row and self.endX == col:
                    cells.append("T")
                else:
                    cells.append(".")
            print("".join(cells))

In [3]:
# Show the start position, then the grid after each of two moves to the right.
env = Env()

env.render()
for _ in range(2):
    env.step(1)
    print("=" * 12)
    env.render()

O....
.....
.....
.....
....T
.O...
.....
.....
.....
....T
..O..
.....
.....
.....
....T


In [4]:
#import Env
import numpy as np
import time
import os

# create environment
env = Env()

# QTable : contains the Q-Values for every (state,action) pair.
# The shape is taken from the environment so the table always matches the grid.
# Alternative random initialisation:
#qtable = np.random.rand(env.stateCount, env.actionCount).tolist()
qtable = np.zeros((env.stateCount, env.actionCount)).tolist()

In [None]:
# hyperparameters
epochs = 1
gamma = 0.1     # discount applied to the best Q-value of the next state
epsilon = 0.08  # probability of taking a random action instead of the greedy one
decay = 0.1     # fraction of epsilon removed after every epoch

# Set to True to clear the terminal and draw the grid on every step.
# Kept off by default: clearing spawns a subprocess per step.
show_training = False

# training loop
for i in range(epochs):
    state, reward, done = env.reset()
    steps = 0

    while not done:
        if show_training:
            os.system('clear')
            print("epoch #", i+1, "/", epochs)
            env.render()
            time.sleep(0.05)

        # count steps to finish game
        steps += 1

        # act randomly sometimes to allow exploration
        if np.random.uniform() < epsilon:
            action = env.randomAction()
        # if not select max action in Qtable (act greedy);
        # on ties list.index returns the lowest action number
        else:
            action = qtable[state].index(max(qtable[state]))

        # take action
        next_state, reward, done = env.step(action)

        # update qtable value with Bellman equation
        # (the old estimate is overwritten, not blended with the new one)
        qtable[state][action] = reward + gamma * max(qtable[next_state])

        # update state
        state = next_state
    # The more we learn, the less we take random actions
    epsilon -= decay*epsilon

    print(f"\nDone in {steps} steps")
    time.sleep(0.8)

In [None]:
            # NOTE(review): these lines repeat the greedy-selection and step
            # statements of the training loop. They are indented with no
            # enclosing block, so this cell cannot run by itself.
            action = qtable[state].index(max(qtable[state]))

        # take action
        next_state, reward, done = env.step(action)

array([[6.77123414e-06, 1.93759287e-05, 5.00849117e-05, 5.00849117e-05],
       [6.42315702e-06, 1.81386395e-04, 6.35394962e-05, 1.38405238e-04],
       [6.35394962e-04, 9.09225398e-04, 6.79507711e-04, 8.77344516e-04],
       [9.09225398e-05, 9.28747283e-04, 9.09225398e-04, 9.09225398e-04],
       [1.81386395e-04, 9.28747283e-05, 1.81386395e-04, 7.42940387e-04],
       [7.79694360e-05, 4.69745996e-04, 6.35394962e-06, 8.26154819e-05],
       [9.39619828e-05, 8.90508305e-04, 7.58716056e-04, 7.34788945e-04],
       [7.71200763e-04, 2.37792634e-03, 6.64681373e-03, 9.82784227e-04],
       [7.94323843e-04, 5.93195177e-04, 4.13165105e-04, 2.84165770e-03],
       [7.82743910e-04, 7.82743910e-04, 1.81386395e-04, 4.48353617e-03],
       [4.69745996e-04, 7.79694360e-04, 7.79694360e-05, 5.30173413e-04],
       [4.69745996e-04, 6.26775040e-03, 6.42315702e-04, 1.08878800e-04],
       [1.93759287e-03, 2.94128600e-03, 4.39552286e-03, 2.45049909e-03],
       [1.91067096e-03, 6.47100462e-03, 9.75481223e

In [None]:
# Replay the greedy policy stored in qtable, starting from the start cell.
# The state returned by reset() is used so the first action is not chosen
# from whatever state the previous cell left behind.
state, _, done = env.reset()
steps = 0
# A greedy policy that has not converged can revisit a state and cycle
# forever in this deterministic grid, so cap the rollout length.
max_steps = 10 * env.stateCount
while not done and steps < max_steps:
    print("======")
    env.render()
    time.sleep(0.05)
    steps += 1
    action = qtable[state].index(max(qtable[state]))
    # take action
    next_state, reward, done = env.step(action)
    # update state
    state = next_state

if not done:
    print("Stopped after", steps, "steps without reaching the target")

O....
.....
.....
.....
....T
.O...
.....
.....
.....
....T
..O..
.....
.....
.....
....T
...O.
.....
.....
.....
....T
.....
...O.
.....
.....
....T
.....
....O
.....
.....
....T
.....
.....
....O
.....
....T
.....
.....
.....
....O
....T
