* Sample Agent Environment Interface 

    - The environment has been modified from FrozenLake to create an 18*18 grid world with 4 deterministic actions
    - Moreover, the slippery states have been made non-slippery so that the environment has no stochasticity
    

In [1]:
import numpy as np
import gym

# Rollout budget for this smoke test of the environment interface.
N_DEMO_EPISODES = 2
MAX_DEMO_STEPS = 10

env = gym.make('FrozenLakeNotSlippery18x18-v0')
# Other environment ids that were tried in place of the one above:
#env = gym.make('FrozenLakeNotSlippery-v1')
#env = gym.make('FrozenLake-v0')
#env = gym.make('FrozenLake8x8-v0')

# Drive the environment with randomly sampled actions, rendering the grid and
# printing the current observation before each step.
for i_episode in range(N_DEMO_EPISODES):
    observation = env.reset()
    for t in range(MAX_DEMO_STEPS):
        env.render()
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if not done:
            continue
        # `t` is zero-based, so the number of steps taken is t + 1.
        print("Episode finished after {} timesteps".format(t+1))
        break

[2017-11-19 02:30:31,269] Making new env: FrozenLakeNotSlippery18x18-v0



[41mS[0mFFFFFFFFFFFFFFFFF
FFFFFFFFFFFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFFFFFFFFHFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFHHHHFFFFFFFFHHHF
FHHHHFFFFHHHFFHHHF
FFFFFFFFHFFFFFFFFG
0
  (Left)
[41mS[0mFFFFFFFFFFFFFFFFF
FFFFFFFFFFFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFFFFFFFFHFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFHHHHFFFFFFFFHHHF
FHHHHFFFFHHHFFHHHF
FFFFFFFFHFFFFFFFFG
0
  (Up)
[41mS[0mFFFFFFFFFFFFFFFFF
FFFFFFFFFFFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFFFFFFFFHFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFHHHHFFFFFFFFHHHF
FHHHHFFFFHHHFFHHHF
FFFFFFFFHFFFFFFFFG
0
  (Down)
SFFFFFFFFFFFFFFFFF
[41mF[0mFFFFFFFFFFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFFFFFFFFHFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFHHHHFFFFFFFFHHHF
FHHHHFFFFHHHFFHHHF
FFFFFFFFHFFFFFFFFG
18
  (Left)
SFFFFFFFFFFFFFFFFF
[41mF[0mFFFFFFFFFFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFFFFFFFFHFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFHHHHFFFFFFFFHHHF
FHHHHFFFFHHHFFHHHF
FFFFFFFFHFFFFFFFFG
18
  (Up)
[41mS[0mFFFFFFFFFFFFFFFFF
FFFFFFFFFFFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFFFFFFFFHFFFFFFFF
FFFFFFFFHFFFFFFFFF
FFHHHHFFFFFFFFHHHF
FHHHHFFFFHHHFFHHHF

* Epsilon Greedy Function 

    - Chooses a greedy action most of the time but with a probability eps chooses a random action
    - Chooses random action with probability of eps; argmax Q(s, .) with probability of (1-eps)

In [2]:
def eps_greedy(q_vals, eps, state):
    """
    Epsilon-greedy action selection over a tabular Q function.

    Inputs:
        q_vals: q value table (2-D numpy array indexed as q_vals[state, action])
        eps: epsilon, the probability of taking an exploratory action
        state: current state (row index into q_vals)
    Outputs:
        a uniformly random action index with probability eps;
        argmax Q(s, .) with probability (1 - eps). Returned as a Python int.
    """
    # The number of actions is read from the table itself, so the function
    # does not depend on whichever `env` happens to be bound globally.
    n_actions = q_vals.shape[1]

    # np.random.random() is drawn from [0, 1); the strict comparison means
    # eps == 0 never explores and eps == 1 always explores.
    if np.random.random() < eps:
        return int(np.random.randint(n_actions))

    # Greedy branch: np.argmax returns the first maximal index on ties.
    return int(np.argmax(q_vals[state, :]))

* Q learning update function. After we observe a transition $(s, a, s', r)$,
     
     $$\textrm{target}(s') = R(s,a,s') + \gamma \max_{a'} Q_k(s',a')$$
     $$\textrm{delta}(s') = \textrm{target}(s') - Q_k(s,a)$$
     $$Q_{k+1}(s,a) \leftarrow Q_k(s,a) + \alpha * \left( \textrm{delta}(s') \right)$$


In [3]:
def q_learning_update(gamma, alpha, q_vals, cur_state, action, next_state, reward):
    """
    Apply one tabular Q-learning step for the transition
    (cur_state, action) -> (next_state, reward).

    Inputs:
        gamma: discount factor
        alpha: learning rate
        q_vals: q value table, indexed as q_vals[state, action]
        cur_state: state the action was taken in
        action: action taken in `cur_state`
        next_state: state reached after taking `action` in `cur_state`
        reward: reward received from this transition

    The entry q_vals[cur_state, action] is modified in place; nothing is returned.
    """
    # Bootstrapped target: immediate reward plus discounted best next-state value.
    td_target = reward + gamma * np.max(q_vals[next_state, :])
    # Error between that target and the current estimate.
    td_error = td_target - q_vals[cur_state, action]
    # Move the current estimate a fraction alpha of the way towards the target.
    q_vals[cur_state, action] += alpha * td_error

* Tabular Q Learning 

In [4]:
env = gym.make('FrozenLake-v0')
#env = gym.make('FrozenLakeNotSlippery18x18-v0')

# Hyper-parameters. `epsilon` is only needed by the eps_greedy alternative
# mentioned inside the loop; the noise-based selection below does not read it.
gamma = 0.95
alpha = 0.8
epsilon = 0.1
episodes_num = 200000

# One row per state, one column per action; every estimate starts at zero.
Q = np.zeros((env.observation_space.n, env.action_space.n))
# Sum of rewards collected in each episode.
rList = []

for itr in range(episodes_num):
    cur_state, ret, done = env.reset(), 0, False
    while not done:
        # Greedy choice over Q perturbed by Gaussian noise whose scale shrinks
        # as 1/(itr+1). Alternative selection rule, currently unused:
        #   action = eps_greedy(Q, epsilon, cur_state)
        noisy_q = Q[cur_state,:] + np.random.randn(1,env.action_space.n)*(1./(itr+1))
        action = np.argmax(noisy_q)
        next_state, reward, done, info = env.step(action)
        # In-place update of Q[cur_state, action] from the observed transition.
        q_learning_update(gamma, alpha, Q, cur_state, action, next_state, reward)
        ret += reward
        cur_state = next_state
    rList.append(ret)

# Mean episode return over the whole run, followed by the learned table.
print ("Score over time: " +  str(sum(rList)/episodes_num))
print("Q-values: %s" %Q)

[2017-11-19 02:30:39,414] Making new env: FrozenLake-v0


Score over time: 0.99977
Q-values: [[ 0.          0.77378094  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.81450625  0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.646912    0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.857375    0.        ]
 [ 0.          0.          0.9025      0.        ]
 [ 0.          0.95        0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          1.          0.        ]
 [ 0.          0.          0.          0.        ]]
