# Homework Assignment Week 2

This week’s homework assignment is to code either value iteration or policy iteration in a unique OpenAI Gym environment. Have it run until it converges to an optimal policy for whatever environment you use! Use this [GitHub](https://github.com/aaksham/frozenlake) link as a starting point.

Share your results on Twitter/Facebook/Instagram using the hashtag #move37 ! – Siraj

### import libraries

In [1]:
# import necessary libraries
import numpy as np
import gym
import time

### common functions

In [39]:
# Executes an episode
def execute(env, policy, gamma=0.1):
    """Roll out one episode under ``policy`` and return its discounted return.

    Args:
        env: Environment exposing ``reset()`` (returns the first state) and
            ``step(action)`` (returns ``(state, reward, done, info)``).
        policy: Indexable by state; each entry is converted to ``int`` and
            used as the action for that state.
        gamma: Discount factor; the reward at step ``t`` is weighted by
            ``gamma ** t``.

    Returns:
        The sum of discounted rewards collected until ``done`` is truthy.
    """
    state = env.reset()
    episode_return = 0
    t = 0
    finished = False
    while not finished:
        # Policy entries may be stored as floats, so cast before stepping.
        action = int(policy[state])
        state, reward, finished, _ = env.step(action)
        episode_return += gamma ** t * reward
        t += 1
    return episode_return


# Evaluates a policy by running it n times. returns: average total reward
def evaluatePolicy(env, policy, gamma=1.0, n=100):
    """Estimate the quality of ``policy`` by averaging ``n`` episode returns.

    Args:
        env: Environment passed through to ``execute``.
        policy: Policy passed through to ``execute``.
        gamma: Discount factor applied within each episode.
        n: Number of episodes to run.

    Returns:
        ``np.mean`` of the ``n`` discounted episode returns.
    """
    episode_returns = []
    for _ in range(n):
        episode_returns.append(execute(env, policy, gamma=gamma))
    return np.mean(episode_returns)


# Get Policy
def getPolicy(env, v, gamma=1.0):
    """Extract the greedy policy with respect to the value function ``v``.

    Args:
        env: Wrapper whose ``env.env`` exposes ``nS``, ``nA`` and a transition
            table ``P`` where ``P[s][a]`` iterates over
            ``(probability, next_state, reward, _)`` tuples.
        v: State values, indexable by state.
        gamma: Discount factor applied to the next-state value.

    Returns:
        A float array of length ``nS`` holding, for each state, the index of
        the action with the highest one-step lookahead value (``np.argmax``
        picks the first index on ties).
    """
    mdp = env.env
    greedy = np.zeros(mdp.nS)
    for state in range(mdp.nS):
        action_values = np.zeros(mdp.nA)
        for action in range(mdp.nA):
            # Expected immediate reward plus discounted value of the successor.
            action_values[action] = sum(
                prob * (reward + gamma * v[next_state])
                for prob, next_state, reward, _ in mdp.P[state][action]
            )
        greedy[state] = np.argmax(action_values)
    return greedy

### value iteration algorithm

In [45]:
# Value Iteration Algorithm
def valueIteration(env, gamma=1.0):
    """Compute state values by repeated Bellman optimality backups.

    Args:
        env: Wrapper whose ``env.env`` exposes ``nS``, ``nA`` and a transition
            table ``P`` where ``P[s][a]`` iterates over
            ``(probability, next_state, reward, _)`` tuples.
        gamma: Discount factor applied to the next-state value in every
            backup.

    Returns:
        A float array of length ``nS`` with the value estimate of each state.
        Iteration stops once the summed absolute change between two sweeps is
        at most ``eps``, or after ``max_iterations`` sweeps.
    """
    mdp = env.env
    value = np.zeros(mdp.nS)  # initialize value-function
    max_iterations = 10000
    eps = 1e-20

    for i in range(max_iterations):
        # Synchronous sweep: every backup reads the previous sweep's values.
        previous_value = np.copy(value)
        for s in range(mdp.nS):
            # The successor value must be discounted by gamma; omitting it
            # makes the result ignore the caller's discount factor.
            value[s] = max(
                sum(
                    p * (r + gamma * previous_value[s_])
                    for p, s_, r, _ in mdp.P[s][a]
                )
                for a in range(mdp.nA)
            )
        diff = np.sum(np.fabs(previous_value - value))
        if diff <= eps:
            print('Value-iteration converged at # {}.'.format(i + 1))
            break
    return value


# Run Value Iteration
def run_value_iteration(env, gamma=1.0):
    """Solve ``env`` with value iteration, score the policy, and print a summary.

    Args:
        env: Environment passed to ``valueIteration``, ``getPolicy`` and
            ``evaluatePolicy``.
        gamma: Discount factor forwarded to all three steps.

    Prints the average score over 1000 evaluation episodes and the wall-clock
    time spent on solving plus evaluation. Returns nothing.
    """
    started = time.time()
    values = valueIteration(env, gamma=gamma)
    # Turn the value function into a greedy policy before scoring it.
    greedy_policy = getPolicy(env, values, gamma=gamma)
    mean_score = evaluatePolicy(env, greedy_policy, gamma=gamma, n=1000)
    elapsed = time.time() - started
    summary = "Best score = {:0.2f}. Time taken = {:4.4f} seconds".format(
        np.mean(mean_score), elapsed
    )
    print(summary)

### policy iteration algorithm

In [46]:
# iteratively calculates the Value-Function under Policy
def calculatePolicyValue(env, policy, gamma=1.0):
    """Evaluate ``policy`` by iterating the Bellman expectation backup.

    Args:
        env: Wrapper whose ``env.env`` exposes ``nS`` and a transition table
            ``P`` where ``P[s][a]`` iterates over
            ``(probability, next_state, reward, _)`` tuples.
        policy: Indexable by state; each entry is the action taken there.
            Entries may be floats (as produced by ``np.zeros``-based arrays)
            and are converted to ``int`` before indexing ``P``.
        gamma: Discount factor applied to the next-state value.

    Returns:
        A float array of length ``nS`` with the value of each state under
        ``policy``. The loop has no iteration cap: it ends only when the
        summed absolute change between two sweeps is at most ``eps``.
    """
    mdp = env.env
    value = np.zeros(mdp.nS)
    eps = 1e-10

    while True:
        # Synchronous sweep: every backup reads the previous sweep's values.
        previous_value = np.copy(value)
        for state in range(mdp.nS):
            # Cast so float-valued policies also work with sequence-indexed
            # transition tables, matching how episodes pick their actions.
            policy_action = int(policy[state])
            value[state] = sum(
                p * (r + gamma * previous_value[s_])
                for p, s_, r, _ in mdp.P[state][policy_action]
            )
        diff = np.sum(np.fabs(previous_value - value))
        if diff <= eps:
            # value converged
            break
    return value


# Policy Iteration Algorithm
def policyIteration(env, gamma=1.0):
    """Alternate policy evaluation and greedy improvement until stable.

    Args:
        env: Wrapper whose ``env.env`` exposes ``nA`` and ``nS``; also passed
            to ``calculatePolicyValue`` and ``getPolicy``.
        gamma: Discount factor forwarded to evaluation and improvement.

    Returns:
        The policy held when improvement stops changing it, or the last
        improved policy if ``max_iterations`` rounds pass without that.
    """
    policy = np.random.choice(env.env.nA, size=(env.env.nS))  # initialize a random policy
    max_iterations = 1000

    for round_index in range(max_iterations):
        state_values = calculatePolicyValue(env, policy, gamma)
        # Greedy improvement with respect to the current policy's values.
        improved_policy = getPolicy(env, state_values, gamma)
        unchanged = np.all(policy == improved_policy)
        if unchanged:
            print('Policy Iteration converged at {}'.format(round_index + 1))
            return policy
        policy = improved_policy
    return policy


# Run Policy Iteration
def run_policy_iteration(env, gamma=1.0):
    """Solve ``env`` with policy iteration, score the policy, and print a summary.

    Args:
        env: Environment passed to ``policyIteration`` and ``evaluatePolicy``.
        gamma: Discount factor forwarded to both steps.

    Prints the evaluation score (using ``evaluatePolicy``'s default episode
    count) and the wall-clock time spent on solving plus evaluation.
    Returns nothing.
    """
    started = time.time()
    best_policy = policyIteration(env, gamma=gamma)
    score = evaluatePolicy(env, best_policy, gamma=gamma)
    elapsed = time.time() - started
    # evaluatePolicy already returns a single averaged value, so np.max
    # leaves it unchanged.
    summary = "Best score = {:0.2f}. Time taken = {:4.4f} seconds".format(
        np.max(score), elapsed
    )
    print(summary)

### main

In [22]:
# Create the FrozenLake environment and set the discount factor passed to the runs below
env = gym.make('FrozenLake-v0')
gamma = 1.0

In [32]:
# Run value iteration; prints the resulting policy's score and the time taken
run_value_iteration(env, gamma)

Value-iteration converged at # 1373.
Best score = 0.74. Time taken = 0.5040 seconds


In [33]:
# Run policy iteration; prints the resulting policy's score and the time taken
run_policy_iteration(env, gamma)

Policy Iteration converged at 4
Best score = 0.76. Time taken = 0.1283 seconds


---
Created a common function **getPolicy()** and edited both the VI and PI algorithms to use it instead of **calculatePolicy()** and **extractPolicy()**, respectively.

In [47]:
# Re-create the FrozenLake environment and discount factor for the re-run below
env = gym.make('FrozenLake-v0')
gamma = 1.0

In [48]:
# Run value iteration; prints the resulting policy's score and the time taken
run_value_iteration(env, gamma)

Value-iteration converged at # 1373.
Best score = 0.74. Time taken = 0.5069 seconds


In [54]:
# Run policy iteration; prints the resulting policy's score and the time taken
run_policy_iteration(env, gamma)

Policy Iteration converged at 4
Best score = 0.76. Time taken = 0.1373 seconds
