## Reinforcement Learning - Frozen Lake Sample Code

![alt text](https://miro.medium.com/max/842/1*Qp14HWQfOeE2UoSxrxCxAg.png)

#### Frozen Lake Simulation

In [0]:
# Frozen Lake Simulation
import numpy as np
import numpy.random as rnd

class Frozen_Lake_Sim:
  """Simulation of a slippery 4x4 frozen lake.

  States are numbered 0..15 in row-major order over ``self.lake``
  (state = row*4 + col). Cells marked 'H' (states 5, 7, 11, 12) are holes
  and the cell marked 'G' (state 15) is the goal.

  Requesting an action never guarantees it: the simulator picks uniformly
  among the requested direction and the two directions perpendicular to it
  (the opposite direction is excluded).
  """

  # states that end an episode without reward
  HOLE_STATES = (5, 7, 11, 12)
  # state that ends an episode with reward 1
  GOAL_STATE = 15

  def __init__(self):
    # frozen lake constructor
    self.loc = (0,0) #current location as (row, col)
    self.state = 0 #current state
    self.actions_st = ['d','u','r','l'] #possible actions as str
    self.actions_rm = {'d':'u', 'u':'d', 'l':'r', 'r':'l'} #for removing opposite action
    self.actions_xy = {'d':(1,0), 'u':(-1,0), 'l':(0,-1), 'r':(0,1)} #action movement as (d_row, d_col)

    # map of lake
    self.lake =   np.array([['F', 'F', 'F', 'F'],
                            ['F', 'H', 'F', 'H'],
                            ['F', 'F', 'F', 'H'],
                            ['H', 'F', 'F', 'G']])
  
  def state(self):
    # retrieve current state
    # NOTE: __init__ assigns the instance attribute `self.state`, which shadows
    # this method on every instance; read `obj.state` directly instead.
    return self.state

  def retrieve(self, state, action):
    """Describe what can happen when `action` is requested in `state`.

    Does not modify the simulator's current state.

    Args:
      state: state index (0..15) to query.
      action: requested action, one of 'd', 'u', 'r', 'l'.

    Returns:
      A tuple (T, R, sp, possible_actions, r) where index i of every item
      refers to the same candidate outcome:
        T: transition probability of the outcome (float32 array of length 3).
        R: reward of the outcome, 1 only when it lands on the goal.
        sp: resulting state; a move that would leave the grid stays in place.
        possible_actions: the three actions that may actually be executed.
        r: 1 when the outcome lands on a hole or the goal, else 0.
      For a hole state, T, R and r are all zeros and sp repeats `state`.

    Raises:
      KeyError: if `action` is not one of the four known actions.
    """
    #candidate actions: everything except the opposite of the requested one
    possible_actions = list(self.actions_st)
    possible_actions.remove(self.actions_rm[action])

    #store information related to three possible actions
    T = np.zeros(3, dtype = np.float32)
    R = np.zeros(3, dtype = np.int32)
    r = np.zeros(3, dtype = np.int32)
    sp = np.ones(3, dtype = np.int32)*state

    #holes are terminal: no outgoing transitions
    if state in self.HOLE_STATES:
      return T, R, sp, possible_actions, r

    #obtain loc from state
    row, col = state // 4, state % 4

    #retrieve information for actions
    for ii, a_st in enumerate(possible_actions):
      d_row, d_col = self.actions_xy[a_st]
      y = row + d_row
      x = col + d_col

      #a move that would leave the grid keeps the agent where it is;
      #a separate variable is used so one candidate's outcome cannot leak
      #into the reward/reset of the next candidate
      next_state = state
      if 0 <= x <= 3 and 0 <= y <= 3:
        next_state = (y*4) + x
      sp[ii] = next_state

      #determine reward and if we reached end state
      reset, reward = 0, 0
      if next_state in self.HOLE_STATES:
        reset = 1
      elif next_state == self.GOAL_STATE:
        reward = 1
        reset = 1

      #update transition probabilities, reward and end state status
      #(0.33 each, so the three entries sum to 0.99 rather than exactly 1)
      T[ii] = 0.33
      R[ii] = reward
      r[ii] = reset

    return T, R, sp, possible_actions, r

  def move(self, action):
    """Request `action` from the current state and advance the simulation.

    One of the three candidate outcomes from `retrieve` is chosen uniformly
    at random, and `self.state` / `self.loc` are updated to match it.

    Args:
      action: requested action, one of 'd', 'u', 'r', 'l'.

    Returns:
      (new_state, executed_action, reward, reset) for the chosen outcome.
    """
    #obtain possible actions from current state
    T, R, sp, possible_actions, r = self.retrieve(self.state, action)
    
    #select uniformly among the candidate outcomes
    ii = rnd.choice(len(possible_actions))
    
    #update state and location based on action made
    self.state = sp[ii]
    self.loc = (self.state // 4, self.state % 4)

    return self.state, possible_actions[ii], R[ii], r[ii]

#### Test Simulator

##### 1. Retrieval

In [0]:
FLS = Frozen_Lake_Sim()
#state before any retrieval
print('current state:', FLS.state)

#retrieve possible actions/end_states from state 14 if we tried to go up
T, R, sp, possible_actions, r = FLS.retrieve(14, 'u')
#walk the three candidate outcomes in parallel
for act, nxt, prob, rew, done in zip(possible_actions, sp, T, R, r):
  print(' action:', act, 'end_state:', nxt, ' T:', prob, ' R:', rew, ' reset:', done)

#the retrieve method does NOT modify the current state
print('current state:', FLS.state)


current state: 0
 action: u end_state: 10  T: 0.33  R: 0  reset: 0
 action: r end_state: 15  T: 0.33  R: 1  reset: 1
 action: l end_state: 13  T: 0.33  R: 0  reset: 0
current state: 0


##### 2. Movement

In [0]:
#state before moving
print('current state:', FLS.state)

#request a move to the right three times; the executed action is random each time
for attempt in range(3):
  end_state, action, R, r = FLS.move('r')
  print(' action:', action, 'end_state:', end_state, ' R:', R, ' reset:', r)
  #the move method DOES modify the current state
  print('current state:', FLS.state)

current state: 0
 action: d end_state: 4  R: 0  reset: 0
current state: 4
 action: d end_state: 8  R: 0  reset: 0
current state: 8
 action: d end_state: 12  R: 0  reset: 1
current state: 12


#### Value Iteration Algorithm

In [0]:
# value iteration sample code
FLS = Frozen_Lake_Sim()

#Q matrix: one row per state (16), one column per action (4), all zeros to start
Q = np.zeros((16, 4), dtype = np.float64)

#actions and states
actions = ['d','u','r','l']
states = [0,1,2,3,4,6,8,9,10,13,14] #excluding end states

discount_rate = 0.90
n_iterations = 2000

#apply value iteration algorithm
for iteration in range(n_iterations):
  #every update in this sweep reads from the previous sweep's values
  Q_prev = Q.copy()

  for s in states:
    for a, action in enumerate(actions):
      #retrieve the three candidate outcomes of requesting this action
      T, R, sp, _, _ = FLS.retrieve(s, action)
      #probability-weighted return of each outcome: T * (R + discount * best Q of next state)
      Q_temp = T * (R + discount_rate * np.max(Q_prev[sp], axis = 1))
      #expected return is the sum over the outcomes
      Q[s, a] = np.sum(Q_temp)

#### Visualize Q Matrix

In [0]:
# rows are states 0-15; columns are the actions in the order of `actions` (d, u, r, l)
# rows for states missing from `states` (5, 7, 11, 12, 15) are never updated and stay zero
print(Q) # states x actions

[[0.05685286 0.05046533 0.05685286 0.05860123]
 [0.03705826 0.05271445 0.03530989 0.03306076]
 [0.06089757 0.0496871  0.06489507 0.06617405]
 [0.0340309  0.04840812 0.02875442 0.0340309 ]
 [0.06270351 0.04119666 0.05631597 0.08010807]
 [0.         0.         0.         0.        ]
 [0.08426605 0.01965369 0.10391974 0.10391974]
 [0.         0.         0.         0.        ]
 [0.10722274 0.13101483 0.09210343 0.06270351]
 [0.23000447 0.12317745 0.19109306 0.14573842]
 [0.25285988 0.09917549 0.21541272 0.28372405]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.29137557 0.25285988 0.3596869  0.17513834]
 [0.62137558 0.52109307 0.59881461 0.37564161]
 [0.         0.         0.         0.        ]]


#### Obtain Value Function

In [0]:
# value of a state = its best Q value over the four actions, laid out on the 4x4 grid
print(np.max(Q,axis = 1).reshape(4,4)) # value for each state
# lake map printed underneath so each value can be matched to its cell
print(FLS.lake)

[[0.05860123 0.05271445 0.06617405 0.04840812]
 [0.08010807 0.         0.10391974 0.        ]
 [0.13101483 0.23000447 0.28372405 0.        ]
 [0.         0.3596869  0.62137558 0.        ]]
[['F' 'F' 'F' 'F']
 ['F' 'H' 'F' 'H']
 ['F' 'F' 'F' 'H']
 ['H' 'F' 'F' 'G']]


#### Obtain Policy

In [0]:
# index of the best action per state, laid out on the 4x4 grid
# (0='d', 1='u', 2='r', 3='l', following the order of `actions`)
# NOTE: the Q rows of hole/goal states are all zero, so argmax reports 0 there;
# those entries carry no meaning
print(np.argmax(Q, axis = 1).reshape(4,4)) # optimal action for each state
# lake map printed underneath so each action can be matched to its cell
print(FLS.lake)

[[3 1 3 1]
 [3 0 2 0]
 [1 0 3 0]
 [0 2 0 0]]
[['F' 'F' 'F' 'F']
 ['F' 'H' 'F' 'H']
 ['F' 'F' 'F' 'H']
 ['H' 'F' 'F' 'G']]


#### Q - Learning

In [0]:
# q-learning sample code
FLS = Frozen_Lake_Sim()

#fix the seed so that a fresh Restart & Run All reproduces the same Q matrix
rnd.seed(0)

#initialize Q matrix (16 states x 4 actions)
Q = np.zeros((16, 4), dtype = np.float64)

#states and actions
actions = ['d','u','r','l']
states = [0,1,2,3,4,6,8,9,10,13,14] #non-terminal states, used as episode starts
reset = 0

learning_rate = 0.05
n_iterations_1 = 200000 #number of episodes
n_iterations_2 = 200 #maximum number of steps per episode
discount_rate = 0.9

#implement q-learning algorithm
for iteration_1 in range(n_iterations_1):
  #randomly select starting state
  s = rnd.choice(states)
  FLS.state = s
  for iteration_2 in range(n_iterations_2):
    #randomly choose action (pure exploration)
    a = rnd.choice([0,1,2,3])
    #make movement in simulation
    sp, a_actual, reward, reset = FLS.move(actions[a])
    
    #blend the old estimate with the one-step target
    term1 = (1-learning_rate) * Q[s, a]
    term2 = (learning_rate) * (reward + discount_rate * np.max(Q[sp]))
    Q[s, a] = term1 + term2
    s = sp # move to next state

    #end the episode as soon as a hole or the goal is reached; checking right
    #after the step keeps the flag from leaking into the next episode when
    #the terminal step is also the last allowed step
    if reset == 1:
      reset = 0
      break

#### Visualize Q Matrix

In [0]:
# rows are states 0-15; columns are the actions in the order of `actions` (d, u, r, l)
# rows of the hole/goal states (5, 7, 11, 12, 15) are never updated and stay zero
print(Q) # states x actions

[[0.1874649  0.17757906 0.18334424 0.19323277]
 [0.11104646 0.16953547 0.09817925 0.11144727]
 [0.16025747 0.15511725 0.16030573 0.17108729]
 [0.08410912 0.14963376 0.08195661 0.1050186 ]
 [0.17955983 0.13696377 0.13025654 0.21687732]
 [0.         0.         0.         0.        ]
 [0.13827286 0.05554225 0.16720656 0.18895715]
 [0.         0.         0.         0.        ]
 [0.23900484 0.28235982 0.18131458 0.14960372]
 [0.35755987 0.19981365 0.21159772 0.29712601]
 [0.34402453 0.20246355 0.28466481 0.33238822]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.3560656  0.31717908 0.49944287 0.26260102]
 [0.70196326 0.57906805 0.66890483 0.48841097]
 [0.         0.         0.         0.        ]]


#### Obtain Value Function

In [0]:
# value of a state = its best Q value over the four actions, laid out on the 4x4 grid
print(np.max(Q,axis = 1).reshape(4,4)) # value for each state
# lake map printed underneath so each value can be matched to its cell
print(FLS.lake)

[[0.19323277 0.16953547 0.17108729 0.14963376]
 [0.21687732 0.         0.18895715 0.        ]
 [0.28235982 0.35755987 0.34402453 0.        ]
 [0.         0.49944287 0.70196326 0.        ]]
[['F' 'F' 'F' 'F']
 ['F' 'H' 'F' 'H']
 ['F' 'F' 'F' 'H']
 ['H' 'F' 'F' 'G']]


#### Obtain Policy

In [0]:
# index of the best action per state, laid out on the 4x4 grid
# (0='d', 1='u', 2='r', 3='l', following the order of `actions`)
# NOTE: the Q rows of hole/goal states are all zero, so argmax reports 0 there;
# those entries carry no meaning
print(np.argmax(Q, axis = 1).reshape(4,4)) # optimal action for each state
# lake map printed underneath so each action can be matched to its cell
print(FLS.lake)

[[3 1 3 1]
 [3 0 3 0]
 [1 0 0 0]
 [0 2 0 0]]
[['F' 'F' 'F' 'F']
 ['F' 'H' 'F' 'H']
 ['F' 'F' 'F' 'H']
 ['H' 'F' 'F' 'G']]


#### Things to try
- assign -1 reward to holes
- implement epsilon greedy
- adjust learning rates