# Q-Learning

## Hyperparameters

In [10]:
import numpy as np
import time
from IPython.display import clear_output

goal_reward = 1
nongoal_reward = -0.2
learning_rate =0.5
discount = 2
initial_q_value_range = (-2,0)

## Environment Setup

In [11]:
class Env():
    def __init__(self):
        self.height = 5
        self.width = 5
        self.posX = 0
        self.posY = 0
        self.endX = self.width-1
        self.endY = self.height-1
        self.actions = [0, 1, 2, 3]
        self.stateCount = self.height*self.width
        self.actionCount = len(self.actions)
        self.step_count = 0

    def reset(self):
        self.posX = 0
        self.posY = 0
        self.step_count = 0
        self.done = False
        return 0, 0, False

    # take action
    def step(self, action):
        if action==0: # left
            self.posX = self.posX-1 if self.posX>0 else self.posX
        if action==1: # right
            self.posX = self.posX+1 if self.posX<self.width-1 else self.posX
        if action==2: # up
            self.posY = self.posY-1 if self.posY>0 else self.posY
        if action==3: # down
            self.posY = self.posY+1 if self.posY<self.height-1 else self.posY

        done = self.posX==self.endX and self.posY==self.endY;
        # mapping (x,y) position to number between 0 and 5x5-1=24
        nextState = self.width*self.posY + self.posX
        reward = goal_reward if done else nongoal_reward
        self.step_count += 1
        return nextState, reward, done

    # display environment
    def render(self):
        ret = ""
        for i in range(self.height):
            for j in range(self.width):
                if self.posY==i and self.posX==j:
                    ret += "O"
                elif self.endY==i and self.endX==j:
                    ret += "T"
                else:
                    ret += "."
            ret += "\n"
        print(ret)

## Q-Learning Algorithm

In [None]:
# create environment
env = Env()

# QTable : contains the Q-Values for every (state,action) pair
qtable = np.random.uniform(low=initial_q_value_range[0], high=initial_q_value_range[1], size=(env.stateCount, env.actionCount))

# training loop
epochs = 50
for i in range(epochs):
    state, reward, done = env.reset()

    while not done:
        clear_output(wait=True)
        print("epoch #", i+1, "/", epochs)
        env.render()
        print(qtable)
        
        time.sleep(0.05) #So that movement is visible
        
        # Choose an action to take
        #action = {Problem 1}
        #current_q = {Problem 1}

        # take action
        next_state, reward, done = env.step(action)

        # update qtable value with update equation
       # qtable[state][action] = {Problem 2}

        # update state
        state = next_state
    
    print(f"Done in {env.step_count} steps")
    time.sleep(0.8)

epoch # 1 / 50
O....
.....
.....
.....
....T

[[-1.49640058 -1.31325464 -0.62558035 -0.55308248]
 [-1.75068279 -1.06501282 -0.17574873 -1.94136892]
 [-0.29131243 -1.61434734 -0.20398675 -1.40514089]
 [-1.12162226 -1.19054469 -0.73640138 -0.52832421]
 [-1.26313646 -0.75375888 -0.79189059 -1.65325737]
 [-0.94056716 -0.32451554 -1.06043589 -1.29243144]
 [-0.90518226 -1.97565932 -1.41758584 -1.040678  ]
 [-0.57527565 -1.33149986 -1.07259632 -1.03215822]
 [-1.46076119 -0.07639011 -0.37897172 -0.59979697]
 [-0.80754897 -1.12811088 -1.50928597 -0.8076878 ]
 [-1.04746517 -1.62585299 -0.77962829 -1.97219865]
 [-0.46472806 -1.44276031 -1.50751724 -1.7641352 ]
 [-1.05888189 -1.87267569 -0.88196391 -1.43623992]
 [-0.19235552 -0.99699126 -0.01249772 -1.61213659]
 [-0.3420593  -0.75537159 -1.10704685 -1.23504836]
 [-0.00799782 -0.02538458 -0.28712833 -1.83678943]
 [-1.3986005  -1.4028128  -1.25416156 -0.35858636]
 [-1.5932906  -0.03481109 -0.12524989 -0.68744255]
 [-0.58961439 -0.84530592 -0.8168568