# Q-Learning

## Hyperparameters

In [11]:
import numpy as np
import time
from IPython.display import clear_output

goal_reward = 1
nongoal_reward = -0.2
learning_rate =0.5
discount = 0.9
initial_q_value_range = (-2,0)

## Environment Setup

In [12]:
class Env():
    def __init__(self):
        self.height = 5
        self.width = 5
        self.posX = 0
        self.posY = 0
        self.endX = self.width-1
        self.endY = self.height-1
        self.actions = [0, 1, 2, 3]
        self.stateCount = self.height*self.width
        self.actionCount = len(self.actions)
        self.step_count = 0

    def reset(self):
        self.posX = 0
        self.posY = 0
        self.step_count = 0
        self.done = False
        return 0, 0, False

    # take action
    def step(self, action):
        if action==0: # left
            self.posX = self.posX-1 if self.posX>0 else self.posX
        if action==1: # right
            self.posX = self.posX+1 if self.posX<self.width-1 else self.posX
        if action==2: # up
            self.posY = self.posY-1 if self.posY>0 else self.posY
        if action==3: # down
            self.posY = self.posY+1 if self.posY<self.height-1 else self.posY

        done = self.posX==self.endX and self.posY==self.endY;
        # mapping (x,y) position to number between 0 and 5x5-1=24
        nextState = self.width*self.posY + self.posX
        reward = goal_reward if done else nongoal_reward
        self.step_count += 1
        return nextState, reward, done

    # display environment
    def render(self):
        ret = ""
        for i in range(self.height):
            for j in range(self.width):
                if self.posY==i and self.posX==j:
                    ret += "O"
                elif self.endY==i and self.endX==j:
                    ret += "T"
                else:
                    ret += "."
            ret += "\n"
        print(ret)

## Q-Learning Algorithm

In [13]:
# create environment
env = Env()

# QTable : contains the Q-Values for every (state,action) pair
qtable = np.random.uniform(low=initial_q_value_range[0], high=initial_q_value_range[1], size=(env.stateCount, env.actionCount))

# training loop
epochs = 50
for i in range(epochs):
    state, reward, done = env.reset()

    while not done:
        clear_output(wait=True)
        print("epoch #", i+1, "/", epochs)
        env.render()
        print(qtable)
        
        time.sleep(0.05) #So that movement is visible
        
        # Choose an action to take
        action = np.argmax(qtable[state])
        current_q = qtable[state][action]

        # take action
        next_state, reward, done = env.step(action)

        # update qtable value with update equation
        qtable[state][action] = (1-learning_rate) * current_q + learning_rate * (reward + discount * np.max(qtable[next_state]))

        # update state
        state = next_state
    
    print(f"Done in {env.step_count} steps")
    time.sleep(0.8)

epoch # 50 / 50
.....
.....
.....
....O
....T

[[-1.83077654 -1.95556002 -1.83084428 -1.13628939]
 [-1.50234559 -0.03561862 -0.69040615 -0.41889164]
 [-0.96329892 -0.59240175 -0.38480063 -1.37121019]
 [-1.59958144 -0.64050959 -1.31170459 -1.44067526]
 [-0.648666   -1.36943078 -1.94092155 -0.68779483]
 [-1.32325768 -1.99525826 -1.33128705 -1.04030156]
 [-0.96533384 -0.86231797 -1.00649656 -1.80688517]
 [-1.42071477 -1.6557604  -0.7278709  -1.770733  ]
 [-1.43897701 -1.00036976 -0.76442341 -1.02371161]
 [-0.79189807 -0.73406129 -1.6535863  -0.64318289]
 [-1.24186386 -1.43014671 -1.27119911 -0.9336623 ]
 [-0.8520989  -0.19414459 -0.79045847 -0.79556819]
 [-1.12173154  0.00650602 -1.14772462 -1.54075289]
 [-0.72320912  0.22945113 -0.75926177 -1.78595418]
 [-0.89649617 -0.67323759 -0.66538607  0.47716792]
 [-1.59049644 -1.88257267 -1.36626999 -0.81517869]
 [-1.53689714 -1.50045791 -0.37473013 -1.8238872 ]
 [-1.9523384  -0.5885081  -1.34644282 -1.29496173]
 [-1.68392684 -1.2993794  -0.246684