In [57]:
import numpy as np
import time
from IPython.display import clear_output

class Env():
    """Rectangular grid world with a start cell (top-left) and a goal cell (bottom-right).

    States are numbered row by row: ``state = width * posY + posX``.
    Actions are ``0`` = left, ``1`` = right, ``2`` = up, ``3`` = down.
    Moving into a wall leaves the agent where it is.
    """

    def __init__(self, width=5, height=5):
        """Create the grid.

        Args:
            width: number of columns (default 5).
            height: number of rows (default 5).

        Raises:
            ValueError: if ``width`` or ``height`` is smaller than 1.
        """
        if width < 1 or height < 1:
            raise ValueError("width and height must both be at least 1")
        self.height = height
        self.width = width
        self.posX = 0
        self.posY = 0
        self.endX = self.width - 1
        self.endY = self.height - 1
        self.actions = [0, 1, 2, 3]
        self.stateCount = self.height * self.width
        self.actionCount = len(self.actions)
        self.step_count = 0
        # Defined here so the attribute exists even before the first reset().
        self.done = self._at_goal()

    def _at_goal(self):
        """Return True when the agent stands on the goal cell."""
        return self.posX == self.endX and self.posY == self.endY

    def reset(self):
        """Put the agent back on the start cell and begin a new episode.

        Returns:
            Tuple ``(state, reward, done)`` for the start cell; on the
            default 5x5 grid this is ``(0, 0, False)``.
        """
        self.posX = 0
        self.posY = 0
        # The step counter is per episode; without this it kept growing
        # across episodes.
        self.step_count = 0
        self.done = self._at_goal()
        return 0, 0, self.done

    # take action
    def step(self, action):
        """Apply ``action`` and return ``(nextState, reward, done)``.

        An action value outside ``self.actions`` does not move the agent but
        still counts as a step. The reward is ``1`` when the goal is reached
        and ``-0.02`` otherwise.
        """
        if action == 0:  # left
            self.posX = max(self.posX - 1, 0)
        elif action == 1:  # right
            self.posX = min(self.posX + 1, self.width - 1)
        elif action == 2:  # up
            self.posY = max(self.posY - 1, 0)
        elif action == 3:  # down
            self.posY = min(self.posY + 1, self.height - 1)

        done = self._at_goal()
        # mapping (x,y) position to a number between 0 and width*height-1
        nextState = self.width * self.posY + self.posX
        reward = 1 if done else -0.02
        self.step_count += 1
        # Keep the attribute in sync with the value handed to the caller.
        self.done = done
        return nextState, reward, done

    # return a random action
    def randomAction(self):
        """Return one of ``self.actions`` chosen uniformly at random."""
        return np.random.choice(self.actions)

    # display environment
    def render(self):
        """Print the grid: ``O`` is the agent, ``T`` the goal, ``.`` an empty cell."""
        rows = []
        for i in range(self.height):
            cells = []
            for j in range(self.width):
                if self.posY == i and self.posX == j:
                    cells.append("O")
                elif self.endY == i and self.endX == j:
                    cells.append("T")
                else:
                    cells.append(".")
            rows.append("".join(cells) + "\n")
        print("".join(rows))

In [58]:
# create environment
env = Env()

# QTable : contains the Q-Values for every (state,action) pair.
# Every entry starts at zero (one row per state, one column per action).
qtable = np.zeros((env.stateCount, env.actionCount))

# hyperparameters
epochs = 50
learning_rate = 0.1
discount = 0.9

# training loop
for i in range(epochs):
    state, reward, done = env.reset()
    steps = 0

    while not done:
        # redraw the progress display in place
        clear_output(wait=True)
        print("epoch #", i+1, "/", epochs)
        env.render()
        print(qtable)

        # short pause so the animation is watchable
        time.sleep(0.05)

        # count steps to finish game
        steps += 1

        # Choose an action to take: always the greedy one. There is no
        # explicit exploration; untried actions keep their initial value of
        # zero while tried non-goal actions receive a negative reward from
        # the environment, so argmax moves on to the untried ones.
        action = np.argmax(qtable[state])
        current_q = qtable[state][action]

        # take action
        next_state, reward, done = env.step(action)

        # update qtable value with Bellman equation; a terminal transition
        # has no future return, so nothing is bootstrapped from next_state
        future_q = 0.0 if done else np.max(qtable[next_state])
        qtable[state][action] = (1-learning_rate) * current_q + learning_rate * (reward + discount * future_q)

        # update state
        state = next_state

    # the former "steps".format(steps) call had no placeholder and did nothing
    print("\nDone in", steps, "steps")
    time.sleep(0.8)

epoch # 50 / 50
.....
.....
.....
.....
...OT

[[-0.04443573 -0.04443217 -0.04443573 -0.04486465]
 [-0.03499424 -0.03285829 -0.03309725 -0.03282883]
 [-0.02463361 -0.02357708 -0.02272303 -0.02268104]
 [-0.01712583 -0.01728682 -0.01729655 -0.01759224]
 [-0.01441674 -0.01358693 -0.01358693 -0.01225855]
 [-0.03309725 -0.03316766 -0.03317162 -0.03277097]
 [-0.02518521 -0.02443366 -0.02577418 -0.00760207]
 [-0.01685413 -0.01697328 -0.01678863  0.00663186]
 [-0.01262574 -0.01079747 -0.01257108 -0.00796228]
 [-0.00806451 -0.0078808  -0.00848979  0.00705977]
 [-0.02272303 -0.0054274  -0.02250107 -0.02202761]
 [-0.01613683  0.10486383 -0.01745828 -0.01596588]
 [-0.01026761 -0.01003134 -0.01080235  0.29333878]
 [-0.00608746 -0.0056162  -0.00644758  0.04889752]
 [-0.00398    -0.00398    -0.0043202   0.15620707]
 [-0.01545106 -0.01367286 -0.01520647 -0.01427071]
 [-0.00993177  0.03422499 -0.01071452 -0.00926823]
 [-0.00611516 -0.0056     -0.004304    0.53651978]
 [-0.00398     0.27720161 -0.00218 