In [1]:
!pip3 install rl_util



In [2]:
import pandas as pd
import jax
import jax.numpy as jnp
import random
import rl_util
from rl_util.environment import MarkovEnv
from rl_util.policy import DeterministicPolicy, EpsSoftPolicy
from rl_util.generator import simple_circle
import random as std_rand

In [3]:
S = 'state'
A = 'action'
R = 'reward'
V = 'value'
G = 'return'

In [4]:
def random_episode_generator(episode_len=10, n_states=10, n_actions=3):
    def f(env, _):
        return [{S : std_rand.randrange(n_states), 
                 A : std_rand.randrange(n_actions), 
                 R : std_rand.gauss(0, 1)} for _ in range(episode_len)]
    return f


def acting_episode_generator(max_episode_len=10):
    def f(env, policy):
        episode = []
        state = env.reset()
        done = False
        t = 0
        while not done and t < max_episode_len:
            action = policy(state)
            next_state, reward, done = env.step(action)
            episode.append(
                {S: state, 
                 A: action, 
                 R: reward})
            state = next_state
            t += 1
        return episode
    return f

# On-policy Monte Carlo prediction

In [5]:
def on_policy_monte_carlo_prediction(policy, phi, env, iterations, episode_generator, first_visit=True):
    v = jnp.ones((len(env.states()), ))
    returns = [[] for _ in env.states()]
    
    for i in range(iterations):
        episode = episode_generator(env, policy)
        g = 0
        used_s = set()
        for T in range(len(episode)):
            for t in range(T - 1, -1, -1):
                g = phi * g + episode[t][R]
                s = episode[t][S]
                if not first_visit or s not in used_s:
                    returns[s].append(g)
                    v = v.at[s].set(sum(returns[s]) / len(returns[s]))
    return v

# On-policy Monte Carlo control with eps-soft policy

In [6]:
def on_policy_monte_carlo_control(phi, eps, env, iterations, episode_generator, first_visit=True):
    policy = EpsSoftPolicy(state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    returns = pd.DataFrame({S : [], A : [], G : []})
    q = pd.DataFrame({S : [], A : [], V : []})
    for i in range(iterations): 
        episode = episode_generator(env, policy)
        
        first_appeared = {}
        for t in range(len(episode)):
            step = episode[t]
            state, action, reward = step[S], step[A], step[R]
            if (state, action) not in first_appeared:
                first_appeared[(state, action)] = t
        
        g = 0
        for t in range(len(episode) - 1, -1, -1):
            step = episode[t]
            state, action, reward = step[S], step[A], step[R]
            g = phi * g + reward
            if not first_visit or first_appeared.get((state, action), None) == t:
                returns = returns.append({S : state, A : action, G : g}, ignore_index=True)
                average_return = returns.loc[(returns[S] == state) & (returns[A] == action)][G].mean()
                if len(q.loc[(q[S] == state) & (q[A] == action)]) == 0:
                    q = q.append({S : state, A : action, V : average_return}, ignore_index=True)
                else:
                    q.loc[(q[S] == state) & (q[A] == action), V] = average_return
                a_max_idx = q.loc[q[S] == state][V].idxmax()
                a_best = q.iloc[a_max_idx][A]
                policy.update(state, a_best)
    return policy, q

# Off-policy Monte Carlo prediction

In [44]:
def off_policy_monte_carlo_prediction(b_policy, t_policy, phi, env, iterations, episode_generator):
    returns = pd.DataFrame({S : [], A : [], R : []})
    q = pd.DataFrame({S : [], A : [], V : []})
    c = pd.DataFrame({S : [], A : [], V : []})
    for _ in range(iterations):
        episode = episode_generator(env, policy)
        g = 0
        w = 1
        for t in range(len(episode) - 1, -1, -1):
            if w == 0:
                break
            step = episode[t]
            state, action, reward = step[S], step[A], step[R]
            g = phi * g + reward
            
            if len(c.loc[(c[S] == state) & (c[A] == action)]) == 0:
                c = c.append({S : state, A : action, V : w}, ignore_index=True)
            else:
                c.loc[(c[S] == state) & (c[A] == action), V] += w

            if len(q.loc[(q[S] == state) & (q[A] == action)]) == 0:
                q = q.append({S : state, A : action, V : 0}, ignore_index=True)
                
            cur_c = c.loc[(c[S] == state) & (c[A] == action)][V]
            cur_q = q.loc[(q[S] == state) & (q[A] == action)][V]
            
            q.loc[(q[S] == state) & (q[A] == action), V] = (cur_q + (g - cur_q) * w / cur_c).values[0]
            w = w * t_policy.p(action, state) / b_policy.p(action, state)
    return q

# Off-policy Monte Carlo control

In [35]:
def off_policy_monte_carlo_control(b_policy, phi, env, iterations, episode_generator):
    t_policy = DeterministicPolicy(state_space=env.state_space(), action_space=env.action_space())
    q = pd.DataFrame({S : [], A : [], V : []})
    c = pd.DataFrame({S : [], A : [], V : []})
    for _ in range(iterations):
        episode = episode_generator(env, policy)
        g = 0
        w = 1
        for t in range(len(episode) - 1, -1, -1):
            step = episode[t]
            state, action, reward = step[S], step[A], step[R]
            g = phi * g + reward
            
            if len(c.loc[(c[S] == state) & (c[A] == action)]) == 0:
                c = c.append({S : state, A : action, V : w}, ignore_index=True)
            else:
                c.loc[(c[S] == state) & (c[A] == action), V] += w

            if len(q.loc[(q[S] == state) & (q[A] == action)]) == 0:
                q = q.append({S : state, A : action, V : 0}, ignore_index=True)
                
            cur_c = c.loc[(c[S] == state) & (c[A] == action)][V]
            cur_q = q.loc[(q[S] == state) & (q[A] == action)][V]
            q.loc[(q[S] == state) & (q[A] == action)][V] = cur_q +  (g - cur_q) * w / cur_c
            
            a_max_idx = q.loc[q[S] == state][V].idxmax()
            a_best = q.iloc[a_max_idx][A]
            t_policy.update(state, a_best)
            
            if not action == a_best:
                break
                
            w = w * 1 / b_policy.p(action, state)
    return t_policy, q

# Testing

In [9]:
def test_policy(env, policy, max_steps=10):
    total_reward = 0
    done = False
    steps = 0
    state = env.reset()
    while not done and steps <= max_steps:
        action = policy(state)
        state, reward, done = env.step(action)
        total_reward += reward
        steps += 1
    print(f'Finished in {steps} steps, reward: {total_reward}')

In [20]:
action_space = 2
state_space = 10

phi = .99
eps = 0.5
iterations = 50

env = simple_circle(state_space=state_space, action_space=action_space)
policy, q = on_policy_monte_carlo_control(phi, eps, env, iterations, acting_episode_generator(), first_visit=True)

In [21]:
test_policy(env, policy, iterations)

Finished in 8 steps, reward: -15.0


In [22]:
on_policy_monte_carlo_prediction(policy, phi, env, iterations, acting_episode_generator(), first_visit=True)

DeviceArray([-16.67142 , -30.36178 , -36.140907, -33.573757, -37.87401 ,
             -37.681473, -35.706627, -18.78731 , -22.580856],            dtype=float32)

In [23]:
target_policy, q = off_policy_monte_carlo_control(policy, phi, env, iterations, acting_episode_generator())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q.loc[(q[S] == state) & (q[A] == action)][V] = cur_q + w / cur_c  * (g - cur_q)


In [24]:
test_policy(env, target_policy, iterations)

Finished in 3 steps, reward: -5.0


In [43]:
off_policy_monte_carlo_prediction(policy, target_policy, phi, env, 2, acting_episode_generator())

Unnamed: 0,state,action,value
0,8.0,0.0,-3.0
1,7.0,0.0,-3.97
2,0.0,1.0,-4.9303
3,7.0,1.0,-4.9303
