# Frozen Lake - Value Iteration - RL

Solving the Frozen Lake problem with value iteration, a dynamic-programming (DP) method for Markov decision processes (MDPs).

### Import Dependencies

In [1]:
import gym
import numpy as np

### Environment Call

In [2]:
# Create the environment registered in gym under the id 'FrozenLake-v0'.
# Every later cell reads its model (state/action counts, transitions) via `env.env`.
env = gym.make('FrozenLake-v0') 

  result = entry_point.load(False)


### Explore the environment attributes of FrozenLake

In [3]:
# Number of states, read from the inner `env.env` object.
# Kept as a notebook-level name because value_iter refers to `nS`.
nS = env.env.nS
nS

16

In [4]:
# Number of actions available in each state, read from the inner `env.env` object.
# Kept as a notebook-level name because value_iter refers to `nA`.
nA = env.env.nA
nA

4

In [5]:
# Inspect the transition model for state 2 under action 3.
# Each entry is a tuple (transition probability, next state, reward, flag).
# NOTE(review): the boolean flag is presumably the episode-termination indicator —
# confirm against the gym FrozenLake documentation.
env.env.P[2][3]

[(0.3333333333333333, 3, 0.0, False),
 (0.3333333333333333, 2, 0.0, False),
 (0.3333333333333333, 1, 0.0, False)]

### Initialization

In [6]:
# Discount factor applied to next-state values; passed to value_iter below and
# used as the default `gamma` of extract_policy.
gamma = 0.85

### Value Iteration Function

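To solve the MDP with dynamic programming, we first initialize the value table to zeros. On every iteration we sweep over all states; for each state we evaluate every possible action by summing, over its possible transitions, the transition probability times the immediate reward plus the discounted value of the next state. These sums form the `Q(s, a)` values, and the largest one is stored as the state's new value: $V(s) \leftarrow \max_a \sum_{s'} P(s' \mid s, a)\,[R(s, a, s') + \gamma V(s')]$. We stop when the total absolute change between two successive value tables is at most `eps` (or after `nepisodes` iterations).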

In [7]:
def value_iter(env, gamma=1.0, nepisodes=10000, eps=1e-20):
    """Estimate state values for ``env`` by synchronous value iteration.

    Every sweep applies the backup
    ``V(s) = max_a sum_{s'} P(s'|s,a) * (reward + gamma * V(s'))`` to each
    state, reading next-state values from the table of the previous sweep.

    Parameters
    ----------
    env : object
        Wrapper whose inner ``env.env`` exposes ``nS`` (number of states),
        ``nA`` (number of actions), ``reset()`` and ``P``, where ``P[s][a]``
        is an iterable of 4-tuples
        ``(transition probability, next state, reward, flag)``.
    gamma : float, optional
        Discount factor applied to next-state values.
    nepisodes : int, optional
        Maximum number of sweeps over the state space.
    eps : float, optional
        Convergence threshold: iteration stops once the summed absolute
        change of the value table in one sweep is ``<= eps``.

    Returns
    -------
    numpy.ndarray
        Array of length ``env.env.nS`` with the value of each state.
    """
    model = env.env
    model.reset()
    # Take the sizes from the environment that was passed in rather than from
    # notebook-level globals, so the function works for any compatible env.
    n_states, n_actions = model.nS, model.nA
    v_table = np.zeros(n_states)
    for i in range(nepisodes):
        # Snapshot of the previous sweep; all backups in this sweep read from it.
        v_prev = v_table.copy()
        for s in range(n_states):
            q_val = []
            for a in range(n_actions):
                next_state_rewards = [
                    T_prob * (reward + gamma * v_prev[next_state])
                    for T_prob, next_state, reward, _ in model.P[s][a]
                ]
                q_val.append(np.sum(next_state_rewards))
            # Store the best action value once per state, after all actions
            # have been evaluated (a state with no actions keeps its value).
            if q_val:
                v_table[s] = max(q_val)
        if np.sum(np.abs(v_prev - v_table)) <= eps:
            print ('Stopping in %d episodes' %(i+1))
            break
    return v_table

### Get The Optimal Values

The optimal values will be used later to extract the optimal policy.

In [8]:
# Run value iteration on the environment with the notebook-level discount factor
# (overriding value_iter's default gamma of 1.0).
optimal_value = value_iter(env=env, gamma=gamma)

Stopping in 197 episodes


In [9]:
# Show the per-state values returned by value_iter.
print(optimal_value)

[0.03115544 0.02939836 0.04320513 0.02824951 0.0476495  0.
 0.0798852  0.         0.08936976 0.17840342 0.23874263 0.
 0.         0.30154673 0.58433243 0.        ]


### Extract the Optimal Policy

Once we have the optimal values, we can compute the optimal policy. For each state, we evaluate every action using the optimal values and pick the action with the highest expected return; doing this for all states yields the optimal policy.

In [10]:
def extract_policy(env, value_table, gamma=gamma):
    """Derive a greedy policy from a table of state values.

    For each state, the expected return of every action is computed from the
    transition model ``env.env.P`` as the sum over transitions of
    ``probability * (reward + gamma * value_table[next_state])``; the index of
    the best action is stored for that state.

    Parameters
    ----------
    env : object
        Wrapper whose inner ``env.env`` exposes ``nS``, ``nA`` and ``P``.
    value_table : sequence of float
        State values indexed by state number.
    gamma : float, optional
        Discount factor. The default is the value the notebook-level ``gamma``
        had when this function was defined.

    Returns
    -------
    numpy.ndarray
        Float array of length ``env.env.nS`` holding the chosen action index
        for each state.
    """
    model = env.env
    n_states = model.nS
    policy = np.zeros(n_states)
    for s in range(n_states):
        action_values = np.zeros(model.nA)
        for a in range(model.nA):
            # Accumulate the expected return of taking action `a` in state `s`.
            expected = 0.0
            for prob, s_next, reward, _flag in model.P[s][a]:
                expected += prob * (reward + gamma * value_table[s_next])
            action_values[a] = expected
        # Greedy choice; ties resolve to the lowest action index (np.argmax).
        policy[s] = np.argmax(action_values)

    return policy


In [11]:
# Derive the greedy policy from the values computed above; gamma is left at
# extract_policy's default.
optimal_policy = extract_policy(env=env, value_table=optimal_value)

In [12]:
# Show the chosen action index for each state (stored as floats by extract_policy).
print(optimal_policy)

[0. 3. 0. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]
