# Q-Learning

## Hyperparameters

In [None]:
import numpy as np
import time
from IPython.display import clear_output

# Reward returned by Env.step for the move that lands on the goal cell.
goal_reward = 1
# Reward (a small penalty) returned by Env.step for every other move.
nongoal_reward = -0.2
# Step size intended for the Q-value update (0 = never learn, 1 = overwrite).
learning_rate = 0.5
# Discount factor applied to future rewards. It must lie in [0, 1]: a value
# above 1 (previously 2) makes distant rewards count more than immediate ones
# and lets the bootstrapped Q-values grow without bound.
discount = 0.9
# (low, high) bounds of the uniform distribution used to initialise the Q-table.
initial_q_value_range = (-2, 0)

## Environment Setup

In [None]:
class Env():
    """A 5x5 grid world with an agent starting at the top-left corner.

    The agent position is (posX, posY); the goal is the bottom-right cell
    (endX, endY). States are encoded as ``width * posY + posX`` and the four
    actions are 0 = left, 1 = right, 2 = up, 3 = down.

    Rewards are read from the module-level ``goal_reward`` and
    ``nongoal_reward`` names when ``step`` is called.
    """

    def __init__(self):
        self.height = 5
        self.width = 5
        self.posX = 0
        self.posY = 0
        self.endX = self.width - 1
        self.endY = self.height - 1
        self.actions = [0, 1, 2, 3]
        self.stateCount = self.height * self.width
        self.actionCount = len(self.actions)
        self.step_count = 0
        # Defined here so the attribute exists before the first reset().
        self.done = False

    def reset(self):
        """Move the agent back to the start and clear the episode counters.

        Returns:
            The tuple ``(state, reward, done)`` for the start of an episode,
            which is always ``(0, 0, False)``.
        """
        self.posX = 0
        self.posY = 0
        self.step_count = 0
        self.done = False
        return 0, 0, False

    # take action
    def step(self, action):
        """Apply ``action`` and return ``(nextState, reward, done)``.

        A move that would leave the grid keeps the agent in place. An action
        outside 0-3 moves nothing but still counts as a step.
        """
        if action == 0:  # left
            if self.posX > 0:
                self.posX -= 1
        elif action == 1:  # right
            if self.posX < self.width - 1:
                self.posX += 1
        elif action == 2:  # up
            if self.posY > 0:
                self.posY -= 1
        elif action == 3:  # down
            if self.posY < self.height - 1:
                self.posY += 1

        done = self.posX == self.endX and self.posY == self.endY
        # Keep the attribute in sync with the value returned to the caller.
        self.done = done
        # mapping (x,y) position to number between 0 and 5x5-1=24
        nextState = self.width * self.posY + self.posX
        reward = goal_reward if done else nongoal_reward
        self.step_count += 1
        return nextState, reward, done

    # display environment
    def render(self):
        """Print the grid: "O" is the agent, "T" the goal, "." an empty cell."""
        rows = []
        for i in range(self.height):
            cells = []
            for j in range(self.width):
                # The agent is drawn on top of the goal when they coincide.
                if self.posY == i and self.posX == j:
                    cells.append("O")
                elif self.endY == i and self.endX == j:
                    cells.append("T")
                else:
                    cells.append(".")
            rows.append("".join(cells) + "\n")
        print("".join(rows))

## Q-Learning Algorithm

In [None]:
# create environment
env = Env()

# QTable : contains the Q-Values for every (state,action) pair,
# initialised uniformly at random inside initial_q_value_range
qtable = np.random.uniform(low=initial_q_value_range[0], high=initial_q_value_range[1], size=(env.stateCount, env.actionCount))

# training loop
epochs = 50
for i in range(epochs):
    state, reward, done = env.reset()

    while not done:
        clear_output(wait=True)
        print("epoch #", i+1, "/", epochs)
        env.render()
        print(qtable)

        time.sleep(0.05)  # So that movement is visible

        # Problem 1: choose the greedy action, i.e. the one with the highest
        # Q-value in the current state, and remember its current estimate.
        action = int(np.argmax(qtable[state]))
        current_q = qtable[state][action]

        # take action
        next_state, reward, done = env.step(action)

        # Problem 2: Q-learning update,
        #   Q(s,a) <- Q(s,a) + lr * (reward + discount * max_a' Q(s',a') - Q(s,a)).
        # No action is ever taken from the goal state, so its row keeps its
        # random initial values; do not bootstrap from it when the episode ends.
        future_q = 0 if done else np.max(qtable[next_state])
        target = reward + discount * future_q
        qtable[state][action] = current_q + learning_rate * (target - current_q)

        # update state
        state = next_state

    print(f"Done in {env.step_count} steps")
    time.sleep(0.8)