# Homework Assignment Week 4 - Q-Learning <a class="tocSkip">

This weeks homework assignment is to implement Q learning from scratch for the gridworld environment. Use this [repository](https://github.com/rlcode/reinforcement-learning/tree/master/1-grid-world) as a guide, but try not to peak at the Q learning code, recreate it, then check your code with it. Good luck!


<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Exercise" data-toc-modified-id="Exercise-1">Exercise</a></span></li></ul></div>

# Exercise

In [1]:
# import libraries
import pixiedust
import numpy as np
import pprint
from grid_world import standard_grid
from utils import print_values, print_policy

Pixiedust database opened successfully


In [69]:
# Global variables
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')
GAMMA = 0.95 # Discount factor from Bellman Equation
# Track positions of interest in the grid
POSITIONS = {
    'START': (2, 0),
    'WIN': (0, 3),
    'LOSE': (1, 3)
}


# Helper Functions   
def display_q(Q):
    print('\nQ Table:')
    pprint.pprint(Q)

    
def report_q(Q):
    for k in sorted(Q.keys()):
        print(f"{str(k):>6}", end='->  ')
        for j in sorted(Q[k].keys()):
            print(f"{j}: {Q[k][j]:>8.3}", end=', ')
        print("")

        
def initialize_q(grid):
    '''initialize Q(s,a) and returns'''
    Q = {}
    states = grid.non_terminal_states()
    for s in states:
        Q[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            Q[s][a] = 0.0
    return Q


def epsilon_action(Q, state, epsilon=0.1):
    '''epsilon greedy action selection'''
    status = f'State: {state}'
    #print(f'\nState: {state}')
    if np.random.random() < (1 - epsilon):
        status += ". Following policy..."
        #print(Q[state])
        #action = np.argmax(Q[state])
        action = max(Q[state], key=Q[state].get)
    else:
        status += ". Following random policy..."
        action = np.random.choice(ALL_POSSIBLE_ACTIONS)
    
    status += f" Taking action '{action}'..."
    #print(f"\nStatus: '{status}'")
    return action


def update_q(Q, prev_state, action, reward, curr_state):
    #report_q(Q)
    alpha = 0.1 # Learning Rate
    #Q[prev_state][action] += alpha * (reward + GAMMA * max(Q[curr_state]) - Q[prev_state][action])
    Q[prev_state][action] += alpha * (reward + GAMMA * Q[curr_state][max(Q[curr_state])] - Q[prev_state][action])
    

# Main Program
def main(num_episodes=1000, epsilon=0.2, episode_window=1000):
    
    def print_episode_status():        
        if curr_state in Q:
            max_q = max(Q[curr_state])
        else:
            max_q = "Terminal State"
        
        print(f"Turn: {turn:>5}| "
            f"Previous State: {str(prev_state):>5}| "
            f"Action: {action}| "
            f"Actual Action: {actual_action}| "
            f"\nReward: {reward:>5.3f}| "
            f"Curr State: {str(curr_state):>5}| "
            f"Max Q: {max_q:>15}")
        
        
    # this grid gives you a reward of -0.1 for every non-terminal state
    # we want to see if this will encourage finding a shorter path to the goal
    grid = standard_grid(obey_prob=0.5, step_cost=-.5)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)
    
    Q = initialize_q(grid)
    #print('Initial Q:')
    #report_q(Q)
    
    total_reward = 0
    
    for episode in range(num_episodes + 1):
        
        if episode % episode_window == 0 and episode != 0:
            avg_reward = total_reward/episode_window
            print(f"\nEpisode = {episode}| Avg Reward = {avg_reward}")
            #report_q(Q)
            total_reward = 0
        
        #if episode % 10000 == 0:
        
        # reset our position to starting position for new episode
        if episode != 0:
            grid.set_state(POSITIONS['START'])
        
        turn = 0
        curr_state = grid.current_state()
        #print(f"\n\nStarting episode {episode} with current state at {curr_state}")
        
        while not grid.game_over():
            prev_state = curr_state
            action = epsilon_action(Q, curr_state, epsilon=epsilon)
            actual_action, reward = grid.move(action)
            total_reward += reward
            curr_state = grid.current_state()

            #print_episode_status()
            
            # check if we reached end points
            if grid.is_terminal(curr_state):
                if curr_state == POSITIONS['WIN']:
                    win_or_lose = 'WON'
                else:
                    win_or_lose = 'LOST'
                #print(f"\nYOU {win_or_lose} episode {episode} with {turn + 1} turns!")
                break

            update_q(Q, prev_state, action, reward, curr_state)
            turn += 1

        #print('Game over!')
    #display_values(Q, returns)

In [70]:
#%pixie_debugger
main(num_episodes=100000, episode_window=500)

rewards:
---------------------------
-0.50|-0.50|-0.50| 1.00|
---------------------------
-0.50| 0.00|-0.50|-1.00|
---------------------------
-0.50|-0.50|-0.50|-0.50|

Episode = 500| Avg Reward = -7.568

Episode = 1000| Avg Reward = -6.923

Episode = 1500| Avg Reward = -7.633

Episode = 2000| Avg Reward = -7.415

Episode = 2500| Avg Reward = -7.418

Episode = 3000| Avg Reward = -7.181

Episode = 3500| Avg Reward = -7.121

Episode = 4000| Avg Reward = -7.472

Episode = 4500| Avg Reward = -7.497

Episode = 5000| Avg Reward = -7.994

Episode = 5500| Avg Reward = -7.051

Episode = 6000| Avg Reward = -7.566

Episode = 6500| Avg Reward = -7.304

Episode = 7000| Avg Reward = -7.481

Episode = 7500| Avg Reward = -7.454

Episode = 8000| Avg Reward = -7.156

Episode = 8500| Avg Reward = -7.612

Episode = 9000| Avg Reward = -7.383

Episode = 9500| Avg Reward = -7.189

Episode = 10000| Avg Reward = -7.435

Episode = 10500| Avg Reward = -7.055

Episode = 11000| Avg Reward = -7.507

Episode = 11500