In [None]:
!pip install gym



In [None]:
#!/usr/bin/env python3
""" Temporal Difference:
    Task 0 - Monte Carlo Method
"""


import numpy as np


def monte_carlo(env, V, policy, episodes=5000,
                max_steps=100, alpha=0.1, gamma=0.99):
    """
    Estimate state values with the first-visit Monte Carlo algorithm.

    For every episode a full trajectory is generated by following
    `policy`, then the discounted return G is accumulated backwards
    through that trajectory and each state's estimate is moved towards
    the return observed at its first visit:

        V[s] <- V[s] + alpha * (G - V[s])

    - env: is the openAI environment instance; `env.reset()` must
           return the initial state and `env.step(action)` must return
           a 4-tuple `(next_state, reward, done, info)`
    - V: is a numpy.ndarray of shape (s,)
         containing the value estimate (updated in place)
    - policy: is a function that takes in a state
              and returns the next action to take
    - episodes: is the total number of episodes
                to train over
    - max_steps: is the maximum number of steps
                 per episode
    - alpha: is the learning rate
    - gamma: is the discount rate
    Returns: V, the updated value estimate
    """
    for _ in range(episodes):
        # (state, reward) pairs in the order they were experienced.
        trajectory = []

        # reset env for new episode
        state = env.reset()

        for _ in range(max_steps):
            action = policy(state)
            next_state, reward, done, _ = env.step(action)
            trajectory.append((state, reward))
            state = next_state

            if done:
                break

        # Index of the earliest occurrence of every state, so that the
        # backward sweep below can recognise true first visits.
        first_visit = {}
        for idx, (visited, _) in enumerate(trajectory):
            first_visit.setdefault(visited, idx)

        # Sweep backwards so the return is built incrementally:
        # G_t = reward_t + gamma * G_{t+1}.
        G = 0
        for idx in range(len(trajectory) - 1, -1, -1):
            visited, reward = trajectory[idx]
            G = gamma * G + reward
            if first_visit[visited] == idx:
                # Move the estimate towards the observed return.
                V[visited] += alpha * (G - V[visited])

    return V

if __name__ == "__main__":
    # Demo driver: evaluate a fixed stochastic policy on FrozenLake 8x8
    # with Monte Carlo and print the resulting value grid.

    import gym
    import numpy as np

    np.random.seed(0)

    env = gym.make('FrozenLake8x8-v1')
    LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

    def policy(s):
        """
        Pick a move for flat state index `s` on the 8x8 grid.

        With probability 0.5 the preference order is RIGHT, DOWN, UP
        (falling back to LEFT); otherwise it is DOWN, RIGHT, LEFT
        (falling back to UP). A direction is only eligible when it
        stays on the board and the target cell is not marked b'H'.
        """
        prefer_right = np.random.uniform() > 0.5
        row, col = divmod(s, 8)

        # Each bounds check short-circuits before env.desc is indexed.
        right_ok = col != 7 and env.desc[row, col + 1] != b'H'
        down_ok = row != 7 and env.desc[row + 1, col] != b'H'

        if prefer_right:
            if right_ok:
                return RIGHT
            if down_ok:
                return DOWN
            if row != 0 and env.desc[row - 1, col] != b'H':
                return UP
            return LEFT

        if down_ok:
            return DOWN
        if right_ok:
            return RIGHT
        if col != 0 and env.desc[row, col - 1] != b'H':
            return LEFT
        return UP

    # Initial estimate: -1 on cells marked b'H', +1 everywhere else.
    initial_grid = np.where(env.desc == b'H', -1, 1)
    V = initial_grid.reshape(64).astype('float64')
    np.set_printoptions(precision=2)
    env.seed(0)
    estimate = monte_carlo(env, V, policy)
    print(estimate.reshape((8, 8)))


[[ 1.1  1.1  1.   1.   1.   1.   1.   1. ]
 [ 1.1  1.1  1.   1.   1.   1.   1.   1. ]
 [ 1.1  1.1  1.1 -1.   1.   1.   1.   1. ]
 [ 1.   1.   1.   1.   1.  -1.   1.   1. ]
 [ 1.   1.   1.  -1.   1.   1.   1.   1. ]
 [ 1.  -1.  -1.   1.   1.   1.  -1.   1. ]
 [ 1.  -1.   1.   1.  -1.   1.  -1.   1. ]
 [ 1.   1.   1.  -1.   1.   1.   1.   1. ]]


In [None]:
#!/usr/bin/env python3
"""
Temporal Difference:
Task 1 - TD(λ).
"""


import numpy as np



def td_lambtha(env, V, policy, lambtha, episodes=5000,
               max_steps=100, alpha=0.1, gamma=0.99):
    """
    function that performs the TD(λ) algorithm with accumulating
    eligibility traces:

    env: is the openAI environment instance; `env.reset()` must return
         the initial state and `env.step(action)` must return a 4-tuple
         `(next_state, reward, done, info)`
    V: is a numpy.ndarray of shape (s,) containing
       the value estimate; it is updated in place, so it should have a
       floating-point dtype
    policy: is a function that takes in a state
            and returns the next action to take
    lambtha: is the eligibility trace factor
    episodes: is the total number of episodes
              to train over
    max_steps: is the maximum number of steps
               per episode
    alpha: is the learning rate
    gamma: is the discount rate

    Returns: V, the updated value estimate
    """
    # Per-step decay applied to every trace; constant across the run.
    trace_decay = lambtha * gamma

    for _ in range(episodes):
        state = env.reset()
        # Traces start at zero for each new episode.
        eligibility = np.zeros_like(V)

        for _ in range(max_steps):
            action = policy(state)
            next_state, reward, done, _ = env.step(action)

            # TD error, bootstrapping from the current estimate of the
            # successor state.
            delta = reward + gamma * V[next_state] - V[state]

            # Accumulate the trace of the state just left.
            eligibility[state] += 1

            # Credit every state in proportion to its trace, then decay
            # all traces (vectorised over the whole state space).
            V += alpha * delta * eligibility
            eligibility *= trace_decay

            if done:
                break

            state = next_state

    return V

if __name__ == "__main__":
    # Demo driver: evaluate a fixed stochastic policy on FrozenLake 8x8
    # with TD(λ) and print the resulting value grid.

    import gym
    import numpy as np

    np.random.seed(0)

    env = gym.make('FrozenLake8x8-v1')
    LEFT, DOWN, RIGHT, UP = 0, 1, 2, 3

    def policy(s):
        """
        Pick a move for flat state index `s` on the 8x8 grid.

        With probability 0.5 the preference order is RIGHT, DOWN, UP
        (falling back to LEFT); otherwise it is DOWN, RIGHT, LEFT
        (falling back to UP). A direction is only eligible when it
        stays on the board and the target cell is not marked b'H'.
        """
        prefer_right = np.random.uniform() > 0.5
        row, col = divmod(s, 8)

        # Each bounds check short-circuits before env.desc is indexed.
        right_ok = col != 7 and env.desc[row, col + 1] != b'H'
        down_ok = row != 7 and env.desc[row + 1, col] != b'H'

        if prefer_right:
            if right_ok:
                return RIGHT
            if down_ok:
                return DOWN
            if row != 0 and env.desc[row - 1, col] != b'H':
                return UP
            return LEFT

        if down_ok:
            return DOWN
        if right_ok:
            return RIGHT
        if col != 0 and env.desc[row, col - 1] != b'H':
            return LEFT
        return UP

    # Initial estimate: -1 on cells marked b'H', +1 everywhere else.
    initial_grid = np.where(env.desc == b'H', -1, 1)
    V = initial_grid.reshape(64).astype('float64')
    np.set_printoptions(precision=4)
    estimate = td_lambtha(env, V, policy, 0.9)
    print(estimate.reshape((8, 8)))


[[-0.8501 -0.8302 -0.8495 -0.7838 -0.7288 -0.7237 -0.6867 -0.6895]
 [-0.8824 -0.8829 -0.8819 -0.8526 -0.8205 -0.7441 -0.68   -0.8189]
 [-0.9108 -0.9199 -0.9537 -1.     -0.869  -0.7854 -0.6057 -0.7755]
 [-0.9326 -0.9399 -0.9507 -0.9431 -0.8995 -1.     -0.6609 -0.6571]
 [-0.9531 -0.9605 -0.9548 -1.     -0.929  -0.8948 -0.7488 -0.4105]
 [-0.9531 -1.     -1.      0.5122 -0.9688 -0.9453 -1.     -0.1842]
 [-0.9395 -1.     -0.5721 -0.0116 -1.     -0.5796 -1.      0.4346]
 [-0.9313 -0.9581 -0.9247 -1.      1.      1.1641  1.099   1.    ]]


In [None]:
#!/usr/bin/env python3
"""
Temporal Difference:
Task 2 - SARSA(λ)
"""

import numpy as np
import random

def epsilon_greedy_policy(Q, state, epsilon):
    """
    Choose an action for `state` with the epsilon-greedy rule.

    Q: is a numpy.ndarray of shape (s,a) containing the Q table
    state: is the index of the current state (row of Q)
    epsilon: is the probability of exploring, i.e. of picking an
             action uniformly at random instead of the greedy one

    Randomness is drawn from `np.random` so that `np.random.seed`
    makes the action sequence reproducible.

    Returns: the index of the selected action
    """
    if np.random.uniform(0, 1) < epsilon:
        # Explore: the upper bound of np.random.randint is exclusive.
        return np.random.randint(0, Q.shape[1])
    # Exploit: highest-valued action for this state.
    return np.argmax(Q[state])

def sarsa_lambtha(env, Q, lambtha, episodes=5000, max_steps=100,
                  alpha=0.1, gamma=0.99, epsilon=1,
                  min_epsilon=0.1, epsilon_decay=0.05):
    """
    function that performs SARSA(λ) with accumulating eligibility
    traces and an epsilon-greedy behaviour policy:

    env: is the openAI environment instance; `env.reset()` must return
         the initial state and `env.step(action)` must return a 4-tuple
         `(next_state, reward, done, info)`
    Q: is a numpy.ndarray of shape (s,a)
       containing the Q table; it is updated in place, so it should
       have a floating-point dtype
    lambtha: is the eligibility trace factor
    episodes: is the total number of episodes
              to train over
    max_steps: is the maximum number of
               steps per episode
    alpha: is the learning rate
    gamma: is the discount rate
    epsilon: is the initial threshold for
             epsilon greedy
    min_epsilon: is the minimum value that epsilon
                 should decay to
    epsilon_decay: is the amount subtracted from epsilon
                   after each episode

    Returns: Q, the updated Q table
    """
    # Per-step decay applied to every trace; constant across the run.
    trace_decay = lambtha * gamma

    for _ in range(episodes):
        state = env.reset()
        # Traces start at zero for each new episode.
        eligibility = np.zeros_like(Q)
        action = epsilon_greedy_policy(Q, state, epsilon)

        for _ in range(max_steps):
            next_state, reward, done, _ = env.step(action)
            # On-policy: the action actually taken next is chosen
            # before the update and used as the bootstrap target.
            next_action = epsilon_greedy_policy(Q, next_state, epsilon)

            delta = (reward + gamma * Q[next_state, next_action]
                     - Q[state, action])
            eligibility[state, action] += 1

            # Credit every (state, action) pair in proportion to its
            # trace, then decay all traces (vectorised over the table).
            Q += alpha * delta * eligibility
            eligibility *= trace_decay

            if done:
                break

            state, action = next_state, next_action

        # Linear decay of the exploration rate, floored at min_epsilon.
        epsilon = max(min_epsilon, epsilon - epsilon_decay)

    return Q

if __name__ == "__main__":
    # Demo driver: learn a Q table on FrozenLake 8x8 with SARSA(λ)
    # and print it.

    import gym
    import numpy as np

    # Seed NumPy first so the random initial Q table is repeatable.
    np.random.seed(0)
    env = gym.make('FrozenLake8x8-v1')

    # Random initial table of shape (64, 4), passed as the (s, a) Q table.
    Q = np.random.uniform(size=(64, 4))
    np.set_printoptions(precision=4)

    learned = sarsa_lambtha(env, Q, 0.9)
    print(learned)


  deprecation(
  deprecation(


[[0.5851 0.6301 0.6096 0.6064]
 [0.6005 0.6069 0.5944 0.6324]
 [0.6729 0.6276 0.6039 0.598 ]
 [0.5817 0.6267 0.6156 0.6374]
 [0.673  0.6743 0.6713 0.6608]
 [0.6722 0.6911 0.6728 0.6642]
 [0.6593 0.7258 0.6783 0.6586]
 [0.7369 0.6752 0.6429 0.6541]
 [0.6256 0.6412 0.633  0.5487]
 [0.5862 0.6121 0.6214 0.6105]
 [0.6115 0.6648 0.6241 0.6222]
 [0.5387 0.5763 0.5637 0.6578]
 [0.6589 0.6787 0.7121 0.6602]
 [0.7133 0.7593 0.7051 0.7266]
 [0.7284 0.6827 0.735  0.6008]
 [0.5654 0.6283 0.7634 0.3472]
 [0.6695 0.6877 0.6689 0.6614]
 [0.6554 0.7322 0.657  0.6657]
 [0.6874 0.6054 0.5139 0.5311]
 [0.2828 0.1202 0.2961 0.1187]
 [0.5902 0.595  0.769  0.5407]
 [0.7737 0.7474 0.77   0.749 ]
 [0.6951 0.7502 0.6833 0.7614]
 [0.6622 0.7203 0.6135 0.7308]
 [0.7012 0.6844 0.6836 0.7395]
 [0.7279 0.7224 0.8011 0.7274]
 [0.7014 0.7879 0.71   0.6458]
 [0.5208 0.8016 0.5745 0.5892]
 [0.7373 0.747  0.7401 0.7725]
 [0.8811 0.5813 0.8817 0.6925]
 [0.8327 0.7426 0.7704 0.7373]
 [0.6794 0.839  0.6408 0.5798]
 [0.764 