# Predicting Rewards with the State-Value Function
https://rl-book.com/learn/mdp/state_value_function/

$$
V_{\pi}(s) = E_{\pi} [G|s] = E_{\pi} [\sum_{k=0}^{T} \gamma^kr_k|s]
$$

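where $G$ is the return, $s$ is the state, $\gamma$ is the discount factor, $r_k$ is the reward received at step $k$, $T$ is the final step of the episode, and $E_{\pi}$ denotes the expectation when following policy $\pi$.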



## Simple grid environment

In [12]:
# Layout of the one-dimensional grid world.
cliff_position = 0  # The cliff position
starting_position = 1  # The starting position
end_position = 5  # The terminating state position

# Rewards associated with the two ends of the grid.
reward_cliff = 0  # Reward for falling off cliff
reward_goal_state = 5  # Reward for reaching goal

def reward(current_position) -> int:
    """Return the reward associated with ``current_position``.

    A position at or below ``cliff_position`` yields ``reward_cliff``; a
    position at or beyond ``end_position`` yields ``reward_goal_state``;
    any other position yields 0. The cliff check is evaluated first.
    """
    fell_off_cliff = current_position <= cliff_position
    if fell_off_cliff:
        return reward_cliff
    return reward_goal_state if current_position >= end_position else 0

def is_terminating(current_position) -> bool:
    """Tell whether ``current_position`` ends the episode.

    Returns True when the position is at or below ``cliff_position`` or at
    or beyond ``end_position``; returns False for everything in between.
    """
    return bool(
        current_position <= cliff_position or current_position >= end_position
    )



## Agent

In [34]:
import numpy as np

def strategy() -> int:
    """Pick a random move: 1 (right) or -1 (left).

    Draws a single sample from ``np.random.random()``; a sample of at least
    0.5 moves right, anything smaller moves left.
    """
    return 1 if np.random.random() >= 0.5 else -1  # right, otherwise left

## Experiment

In [125]:
import numpy as np
np.random.seed(42)  # Fixed seed so the printed episodes are reproducible

# Global buffers to perform averaging later, both indexed by position
value_sum = np.zeros(end_position + 1)  # Sum of episode rewards credited to each position
n_hits = np.zeros(end_position + 1)  # Number of times each position was visited

n_iter = 10
for i in range(n_iter):
    position_history = []  # A log of positions in this episode
    current_position = starting_position  # Reset
    while True:
        # Append position to log (the terminating position is logged as well)
        position_history.append(current_position)

        if is_terminating(current_position):
            break

        # Update current position according to strategy
        current_position += strategy()

    # Now the episode has finished, what was the reward?
    current_reward = reward(current_position)

    # Now add the reward to the buffers that allow you to calculate the average.
    # Every occurrence in the log counts, so a position visited twice in one
    # episode is credited twice.
    for pos in position_history:
        value_sum[pos] += current_reward
        n_hits[pos] += 1

    print("transitions:", position_history)
    print("overall reward for this simulation iteration:", current_reward)
    print("updated value function result:", value_sum)

    # Now calculate the running average over all episodes so far and print.
    # Positions that have never been visited have n_hits == 0; dividing there
    # would emit a RuntimeWarning, so the division is restricted to visited
    # positions and the remaining entries stay NaN (still printed as "nan").
    average_reward = np.divide(
        value_sum,
        n_hits,
        out=np.full_like(value_sum, np.nan),
        where=n_hits > 0,
    )
    expected_return = ', '.join(f'{q:.2f}' for q in average_reward)
    print("[{}] Average reward: [{}]".format(i, expected_return))
    

transitions: [1, 0]
overall reward for this simulation iteration: 0
updated value function result: [0. 0. 0. 0. 0. 0.]
[0] Average reward: [0.00, 0.00, nan, nan, nan, nan]
transitions: [1, 2, 3, 4, 3, 2, 1, 2, 3, 4, 3, 4, 5]
overall reward for this simulation iteration: 5
updated value function result: [ 0. 10. 15. 20. 15.  5.]
[1] Average reward: [0.00, 3.33, 5.00, 5.00, 5.00, 5.00]
transitions: [1, 0]
overall reward for this simulation iteration: 0
updated value function result: [ 0. 10. 15. 20. 15.  5.]
[2] Average reward: [0.00, 2.50, 5.00, 5.00, 5.00, 5.00]
transitions: [1, 0]
overall reward for this simulation iteration: 0
updated value function result: [ 0. 10. 15. 20. 15.  5.]
[3] Average reward: [0.00, 2.00, 5.00, 5.00, 5.00, 5.00]
transitions: [1, 0]
overall reward for this simulation iteration: 0
updated value function result: [ 0. 10. 15. 20. 15.  5.]
[4] Average reward: [0.00, 1.67, 5.00, 5.00, 5.00, 5.00]
transitions: [1, 0]
overall reward for this simulation iteration: 0

  expected_return = ', '.join(f'{q:.2f}' for q in value_sum / n_hits)
