#  Assignment 4 - Student Information

Le Phuoc Vinh Linh

Student ID: 20521531

Class: CS106.M21.KHTN


# Import necessary libraries and tools

In [1]:
import gym
import numpy as np
import time
from IPython import display

# Initialize FrozenLake-v0 environment

In [2]:
# Build the 'FrozenLake-v0' environment through gym's registry; the cells
# below read its transition table and the sizes of its state/action spaces.
env = gym.make('FrozenLake-v0')

In [3]:
# Transition model for state 0 under action 3: a list of
# (probability, next_state, reward, done) tuples, one per possible outcome.
env.P[0][3]

[(0.3333333333333333, 1, 0.0, False),
 (0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 0, 0.0, False)]

### The observation space has 16 states

In [4]:
# Number of discrete states; the algorithms below use it to size the value table.
env.observation_space.n

16

### The action space has 4 actions

In [5]:
# Number of discrete actions; the algorithms below loop over range(n) actions.
env.action_space.n

4

# Implementation

### Implement Value Iteration algorithm

In [6]:
def value_iteration(env, max_iters, gamma):
    """Estimate the optimal state values with synchronous value iteration.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``P`` where ``P[state][action]`` yields
            ``(prob, next_state, reward, done)`` tuples.
        max_iters: Upper bound on the number of sweeps over the state space.
        gamma: Discount factor applied to the value of the next state.

    Returns:
        A NumPy array holding one value per state.

    Prints a message with the sweep index when the values stop changing
    before ``max_iters`` sweeps have been performed.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    v_values = np.zeros(n_states)

    for i in range(max_iters):
        # Every backup in this sweep reads from a frozen copy of the
        # previous sweep, so the update is synchronous.
        snapshot = v_values.copy()

        for state in range(n_states):
            action_returns = []
            for action in range(n_actions):
                # Expected one-step return of `action`, summed over outcomes.
                expected = 0
                for prob, next_state, reward, _ in env.P[state][action]:
                    expected += prob * (reward + gamma * snapshot[next_state])
                action_returns.append(expected)

            # The value of a state is the return of its best action.
            v_values[state] = np.max(action_returns)

        # Stop once a full sweep leaves the values numerically unchanged.
        if np.isclose(v_values, snapshot).all():
            print(f'Converged at {i}-th iteration.')
            break

    return v_values

In [7]:
# Run value iteration on the environment (at most 1000 sweeps, discount 0.9);
# the function prints the sweep index at which the values converged.
v_values = value_iteration(env, max_iters=1000, gamma=0.9)

Converged at 79-th iteration.


### Implement Policy Extraction Method

In [8]:
def policy_extraction(env, v_values, gamma=0.9):
    """Derive the greedy policy for a given table of state values.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``
            and a transition table ``P`` where ``P[state][action]`` yields
            ``(prob, next_state, reward, done)`` tuples.
        v_values: Sequence indexable by state that holds each state's value.
        gamma: Discount factor applied to the value of the next state.

    Returns:
        An integer NumPy array with the chosen action index for every state.
        Ties are resolved in favour of the lowest action index.
    """
    # `np.int` was only an alias of the builtin `int`. It was deprecated in
    # NumPy 1.20 and removed in 1.24, so the builtin is used directly.
    policy = np.zeros(env.observation_space.n, dtype=int)

    for state in range(env.observation_space.n):
        q_values = []
        for action in range(env.action_space.n):
            # Expected one-step return of `action`, summed over outcomes.
            q_value = 0
            for prob, next_state, reward, _done in env.P[state][action]:
                q_value += prob * (reward + gamma * v_values[next_state])
            q_values.append(q_value)

        # Act greedily with respect to the one-step look-ahead.
        policy[state] = np.argmax(q_values)

    return policy

In [9]:
# Greedy policy derived from the value-iteration result; the bare `policy`
# expression on the last line makes the notebook display the array.
policy = policy_extraction(env, v_values, gamma=0.9)
policy

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until


array([0, 3, 0, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0])

### Implement Policy Iteration Algorithm

In [10]:
def policy_iteration(env, max_iters, gamma):
    """Find a policy by alternating policy evaluation and improvement.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()`` and a transition table ``P`` where
            ``P[state][action]`` yields ``(prob, next_state, reward, done)``
            tuples.
        max_iters: Upper bound used both for the number of outer
            evaluate/improve rounds and for the sweeps of each evaluation.
        gamma: Discount factor applied to the value of the next state.

    Returns:
        A NumPy array with one action per state.

    Prints a message with the round index when an improvement step leaves
    the policy unchanged.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Start from an arbitrary policy: one sampled action per state.
    ini_pi = np.array([env.action_space.sample() for _ in range(n_states)])

    for i in range(max_iters):
        # ---- Policy evaluation: value of following `ini_pi`, from scratch.
        v_values = np.zeros(n_states)
        for _ in range(max_iters):
            snapshot = v_values.copy()

            for state in range(n_states):
                expected = 0
                for prob, next_state, reward, _done in env.P[state][ini_pi[state]]:
                    expected += prob * (reward + gamma * snapshot[next_state])
                v_values[state] = expected

            # The evaluation has converged once a sweep changes nothing.
            if np.isclose(v_values, snapshot).all():
                break

        # ---- Policy improvement: act greedily w.r.t. the evaluated values.
        prev_pi = ini_pi.copy()
        for state in range(n_states):
            action_returns = []
            for action in range(n_actions):
                expected = 0
                for prob, next_state, reward, _done in env.P[state][action]:
                    expected += prob * (reward + gamma * v_values[next_state])
                action_returns.append(expected)

            ini_pi[state] = np.argmax(action_returns)

        # A policy that survives improvement unchanged is the fixed point.
        if np.isclose(ini_pi, prev_pi).all():
            print(f'Converged at {i}-th iteration.')
            break

    return ini_pi

In [11]:
# Run policy iteration with the same limits as value iteration above; the
# bare `pi` expression on the last line makes the notebook display the policy.
pi = policy_iteration(env, max_iters=1000, gamma=0.9)
pi

Converged at 5-th iteration.


array([0, 3, 0, 3, 0, 0, 0, 0, 3, 1, 0, 0, 0, 2, 1, 0])

### Play function

In [12]:
def play(env, policy, render=False):
    """Run one episode by following ``policy`` and return its total reward.

    Args:
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        policy: Sequence indexable by state that gives the action to take.
        render: When true, draw the environment after every step, pause
            briefly, and clear the notebook output between frames.

    Returns:
        The sum of the rewards collected until the episode reports ``done``.
    """
    state = env.reset()
    total_reward = 0

    while True:
        state, reward, done, _info = env.step(policy[state])
        total_reward += reward

        if render:
            env.render()
            time.sleep(0.2)
            # Keep the final frame on screen; clear only intermediate ones.
            if not done:
                display.clear_output(wait=True)

        if done:
            return total_reward

In [13]:
# Play a single episode with the extracted policy; the displayed value is the
# episode's total reward.
play(env, policy)

1.0

### Play_multiple_times function

In [14]:
def play_multiple_times(env, policy, max_episodes):
    """Play ``max_episodes`` episodes and count those with positive reward.

    Args:
        env: Environment handed to ``play`` for every episode.
        policy: Policy handed to ``play`` for every episode.
        max_episodes: Number of episodes to play.

    Returns:
        The number of episodes whose total reward was greater than zero.
        The same count is also printed.
    """
    success = sum(1 for _ in range(max_episodes) if play(env, policy) > 0)
    print(f'Number of successes: {success}/{max_episodes}')
    return success

In [15]:
# Estimate the success rate of the extracted policy over 1000 episodes.
play_multiple_times(env, policy, 1000)

Number of successes: 732/1000


732

In the run above, 732 out of 1000 episodes were successful. Because the environment is stochastic, this count varies from run to run, so a larger number of episodes is used below to obtain a more reliable estimate.

Each experiment below plays 2000 episodes per policy; the planning algorithms are allowed at most 1000 iterations.

In [None]:
# Initialize parameters
MAX_ITERS = 1000  # iteration cap passed as `max_iters` to both planning algorithms
MAX_EPISODES = 2000  # number of evaluation episodes played per policy
GAMMA = 0.9  # discount factor used for planning and policy extraction

# Experiment

### Experiment with FrozenLake-v0 

In [None]:
# Keep the environment id in one place so the report labels cannot drift away
# from the environment that is actually evaluated (the labels used to say
# "FrozenLake8x8-v0" here).
env_name = 'FrozenLake-v0'
env = gym.make(env_name)

# Value Iteration: plan first, then time only the evaluation episodes.
vi_value = value_iteration(env, max_iters=MAX_ITERS, gamma=GAMMA)
policy_from_value = policy_extraction(env, vi_value, GAMMA)
start_vi = time.time()
vi_number_of_successes = play_multiple_times(env, policy=policy_from_value, max_episodes=MAX_EPISODES)
vi_time = time.time() - start_vi

# Policy Iteration: same protocol, so the two reports are comparable.
pi = policy_iteration(env, max_iters=MAX_ITERS, gamma=GAMMA)
start_pi = time.time()
pi_number_of_successes = play_multiple_times(env, policy=pi, max_episodes=MAX_EPISODES)
pi_time = time.time() - start_pi

# "Average time" is the mean wall-clock time per played episode; the planning
# step itself runs before the timers start and is not included.
print(f'Number of successes of Value Iteration in {env_name} : {vi_number_of_successes}/{MAX_EPISODES}, Average time : {vi_time/MAX_EPISODES}s')
print(f'Number of successes of Policy Iteration in {env_name} : {pi_number_of_successes}/{MAX_EPISODES}, Average time : {pi_time/MAX_EPISODES}s')

Converged at 79-th iteration.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Number of successes: 1476/2000
Converged at 2-th iteration.
Number of successes: 1436/2000
Number of successes of Value Iteration in FrozenLake8x8-v0 : 1476/2000, Average time : 0.0004789602756500244s
Number of successes of Policy Iteration in FrozenLake8x8-v0 : 1436/2000, Average time : 0.0004445765018463135s


### Experiment with FrozenLake8x8-v0 

In [None]:
# Same experiment on the 'FrozenLake8x8-v0' environment.
env = gym.make('FrozenLake8x8-v0')

# Value Iteration: plan first, then time only the evaluation episodes.
vi_value = value_iteration(env, max_iters=MAX_ITERS, gamma=GAMMA)
policy_from_value = policy_extraction(env, vi_value, GAMMA)
start_vi = time.time()
vi_number_of_successes = play_multiple_times(env, policy=policy_from_value, max_episodes=MAX_EPISODES)
vi_time = time.time() - start_vi

# Policy Iteration: same protocol, so the two reports are comparable.
pi = policy_iteration(env, max_iters=MAX_ITERS, gamma=GAMMA)
start_pi = time.time()
pi_number_of_successes = play_multiple_times(env, policy=pi, max_episodes=MAX_EPISODES)
pi_time = time.time() - start_pi

# "Average time" is the mean wall-clock time per played episode; the planning
# step itself runs before the timers start and is not included.
print(f'Number of successes of Value Iteration in FrozenLake8x8-v0 : {vi_number_of_successes}/{MAX_EPISODES}, Average time :  {vi_time/MAX_EPISODES}s')
print(f'Number of successes of Policy Iteration in FrozenLake8x8-v0 : {pi_number_of_successes}/{MAX_EPISODES}, Average time : {pi_time/MAX_EPISODES}s')

Converged at 117-th iteration.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Number of successes: 1508/2000
Converged at 3-th iteration.
Number of successes: 1492/2000
Number of successes of Value Iteration in FrozenLake8x8-v0 : 1508/2000, Average time :  0.0006862468719482422s
Number of successes of Policy Iteration in FrozenLake8x8-v0 : 1492/2000, Average time : 0.0008852777481079101s


### Experiment with Taxi-v3

In [None]:
# Keep the environment id in one place so the report labels cannot drift away
# from the environment that is actually evaluated (the labels used to say
# "FrozenLake8x8-v0" here).
env_name = 'Taxi-v3'
env = gym.make(env_name)

# Value Iteration: plan first, then time only the evaluation episodes.
vi_value = value_iteration(env, max_iters=MAX_ITERS, gamma=GAMMA)
policy_from_value = policy_extraction(env, vi_value, GAMMA)
start_vi = time.time()
vi_number_of_successes = play_multiple_times(env, policy=policy_from_value, max_episodes=MAX_EPISODES)
vi_time = time.time() - start_vi

# Policy Iteration: same protocol, so the two reports are comparable.
pi = policy_iteration(env, max_iters=MAX_ITERS, gamma=GAMMA)
start_pi = time.time()
pi_number_of_successes = play_multiple_times(env, policy=pi, max_episodes=MAX_EPISODES)
pi_time = time.time() - start_pi

# "Average time" is the mean wall-clock time per played episode; the planning
# step itself runs before the timers start and is not included.
print(f'Number of successes of Value Iteration in {env_name} : {vi_number_of_successes}/{MAX_EPISODES}, Average time : {vi_time/MAX_EPISODES}s')
print(f'Number of successes of Policy Iteration in {env_name} : {pi_number_of_successes}/{MAX_EPISODES}, Average time : {pi_time/MAX_EPISODES}s')

Converged at 116-th iteration.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Number of successes: 2000/2000
Converged at 17-th iteration.
Number of successes: 2000/2000
Number of successes of Value Iteration in FrozenLake8x8-v0 : 2000/2000, Average time : 0.00013901913166046141s
Number of successes of Policy Iteration in FrozenLake8x8-v0 : 2000/2000, Average time : 0.00016633963584899902s


# Conclusion

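From the experimental results, the number of successful episodes obtained by Value Iteration and Policy Iteration on the three environments above differs only slightly (1476 vs. 1436 on FrozenLake-v0, 1508 vs. 1492 on FrozenLake8x8-v0, and 2000 vs. 2000 on Taxi-v3). Policy Iteration converges in far fewer iterations than Value Iteration (2, 3, and 17 versus 79, 117, and 116), although each of its iterations contains a complete policy-evaluation loop. The measured average time per episode is of the same order for both algorithms, and the difference is not very large for 2000 episodes; note that the timers cover only the episode rollouts, not the planning step itself.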

From an implementation perspective, Policy Iteration looks more complicated but typically needs fewer iterations than Value Iteration. Both algorithms are guaranteed to converge to an optimal policy, but they differ in their implementation, computational cost per iteration, execution speed, and so on.





