## Import Packages
* gym - collection of environments for reinforcement learning algorithms.
* numpy - package for scientific computing with Python

In [None]:
import gym
import numpy as np

## Define the environment
The agent needs to navigate across a frozen lake, where the ice has melted in a few places, to retrieve a frisbee.
The surface is described using a grid like the following:
* SFFF       (S: starting point, safe)
* FHFH       (F: frozen surface, safe)
* FFFH       (H: hole, fall to your doom)
* HFFG       (G: goal, where the frisbee is located)

The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise.

In [None]:
# Instantiate the FrozenLake environment by its registered gym id; every
# later cell interacts with the lake through this `env` object.
env = gym.make("FrozenLake-v0")

## Spaces
* Every environment comes with an action_space and an observation_space. 
* These attributes are of type Space, and they describe the format of valid actions and observations.

In [None]:
# Show how many discrete actions are available; the Q-table defined later
# stores one value per action for every state.
env.action_space.n

In [None]:
# Show the observation space, i.e. the format of the states returned by
# env.reset() and env.step().
env.observation_space

## Define parameters
* Define the learning rate (alpha) and the discount factor (gamma)
* Create a Q-Table and initialize it with 1 for each state-action pair

In [None]:
# Learning rate: fraction of the temporal-difference error applied to a
# Q-value on each update in the training loop.
alpha = 0.4

In [None]:
# Discount factor: weight given to the next state-action value when
# computing the update target for a non-terminal step.
gamma = 0.999

In [None]:
# Q-table: maps each of the 16 grid states to its own list of four action
# values, all starting at 1. Every state gets a distinct list object so that
# updating one state's values never touches another's.
q_table = {state: [1] * 4 for state in range(16)}

In [None]:
# Display the freshly initialized Q-table (every entry is still 1).
q_table

## Define choose action method
* Pick the action with the highest q value

In [None]:
def choose_action(observ):
    """Return the greedy action for state ``observ``.

    Looks up the row of action values stored for ``observ`` in the
    module-level ``q_table`` and returns the index of the largest one.
    When several actions share the maximum, ``np.argmax`` yields the
    lowest such index.
    """
    action_values = q_table[observ]
    best_action = np.argmax(action_values)
    return best_action

## Train the agent
* Train the agent for 10000 episodes.
* Invoke the reset method to start an episode.
* Choose an arbitrary initial action.
* Initialize previous observation and action
* Iterate over 2500 timesteps. For each timestep:
    * Render the environment for visualization.
    * Execute the chosen action and store the return values.
    * Choose the next action based on observations returned by the previous action.
    * If this is not the first action in the episode,
        * get the q value for the previous action and previous observation.
        * If the new state is not terminal, compute Q(St, At) ← Q(St, At) + α[Rt+1 + γQ(St+1, At+1) − Q(St, At)].
        * If the episode is over, drop the discounted next-state term γQ(St+1, At+1) when computing the Q value, i.e. Q(St, At) ← Q(St, At) + α[Rt+1 − Q(St, At)].
        * update the q table
    * Update the previous observation and previous action parameters
    * If the episode has terminated, print out the info. 
* Note that the executed action is not always the chosen action. Some episodes terminate in goal and some in a hole.         
        

In [None]:
# SARSA training loop: run 10000 episodes of at most 2500 timesteps each,
# updating q_table in place after every step for which a previous
# (state, action) pair is available.
for i in range(10000):

    # Start a new episode and pick the first action greedily from the Q-table.
    observ = env.reset()
    action = choose_action(observ)

    # No (state, action) pair is awaiting an update until one step was taken.
    # NOTE(review): because of this, the very first transition of each episode
    # (reset state -> first step) is never used for an update — confirm that
    # this is intended.
    prev_observ = None
    prev_action = None

    for t in range(2500):

        env.render()

        # Execute the chosen action, then select the follow-up action for the
        # state we landed in; its Q-value is the bootstrap term of the update.
        observ, reward, done, info = env.step(action)
        action = choose_action(observ)

        if prev_observ is not None:

            q_old = q_table[prev_observ][prev_action]

            if done:
                # Terminal transition: there is no next state-action value to
                # bootstrap from, so the target is the reward alone.
                q_new = q_old + alpha * (reward - q_old)
            else:
                q_new = q_old + alpha * (reward + gamma * q_table[observ][action] - q_old)

            # new_table is the row for prev_observ (the same list object held
            # by q_table); the name is kept because a later cell displays it.
            new_table = q_table[prev_observ]
            new_table[prev_action] = q_new
            q_table[prev_observ] = new_table

        prev_observ = observ
        prev_action = action

        # Stop the episode as soon as the environment reports termination
        # (goal reached or fell into a hole) and report how it went.
        if done:
            print("Episode {} finished after {} timesteps with r={}.".format(i, t, reward))
            break

In [None]:
# Display the Q-table row that the training loop modified most recently
# (new_table is assigned from q_table[prev_observ] inside the loop).
new_table

In [None]:
# Display the Q-table after training to inspect the learned action values.
q_table