In [1]:
# Use the %pip magic (not `!pip3`) so the package is installed into the
# environment of the running kernel rather than whichever pip3 is on PATH.
%pip install rl_util



In [2]:
import pandas as pd
import jax
import jax.numpy as jnp
import random
from rl_util.test import test_policy
from rl_util.value import QFunction
from rl_util.environment import MarkovEnv
from rl_util.policy import EpsSoftPolicy, EpsSoftPolicyFromQ
from rl_util.generator import simple_circle
import numpy as np

# Short aliases for the string keys used to index the tabular data frames.
S = "state"
A = "action"
R = "reward"
V = "value"
G = "return"

In [39]:
class EpsSoftPolicyFromQs(EpsSoftPolicy):
    """Epsilon-soft policy that is greedy with respect to the SUM of several Q-tables.

    Each entry of ``qs`` is a data frame with (at least) the columns named by
    ``S``, ``A`` and ``V``. The greedy action in a state is the one whose
    values, summed over all tables, is largest; ties go to the lowest action
    index. The greedy action gets probability ``1 - eps + eps / action_space``
    and every other action gets ``eps / action_space``.

    Relies on ``self.eps`` and ``self.action_space`` being available after
    ``EpsSoftPolicy.__init__`` runs.
    """

    def __init__(self, qs, state_space: int, action_space: int, eps: float):
        super().__init__(state_space, action_space, eps)
        self.qs = qs

    def update(self, s, a):
        """Not supported: the policy is fully determined by the Q-tables it reads."""
        raise Exception(':(')

    def _greedy_action(self, s):
        """Return the action index maximising the summed Q-value in state ``s``."""
        best_a = None
        best_v = float('-inf')
        for candidate in range(self.action_space):
            # One matching row per table is expected; ``.values[0]`` raises
            # IndexError if a (state, action) pair is missing from a table.
            cur_v = sum(
                q.loc[(q[S] == s) & (q[A] == candidate)][V].values[0]
                for q in self.qs
            )
            # Strict '>' keeps the first (lowest-index) action on ties.
            if cur_v > best_v:
                best_v = cur_v
                best_a = candidate
        return best_a

    def p(self, a, s):
        """Return the probability of choosing action ``a`` in state ``s``.

        The search for the greedy action uses its own loop variable, so the
        queried action ``a`` is no longer overwritten before the comparison.
        """
        if a == self._greedy_action(s):
            return 1 - self.eps + self.eps / self.action_space
        return self.eps / self.action_space

    def __call__(self, s):
        """Sample an action for state ``s`` from the epsilon-soft distribution."""
        best_a = self._greedy_action(s)
        probs = [self.eps / self.action_space for _ in range(self.action_space)]
        probs[best_a] = 1 - self.eps + self.eps / self.action_space
        return random.choices(list(range(self.action_space)), probs, k=1)[0]

# SARSA (on-policy TD control)
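SARSA stands for state–action–reward–state–action: each update uses the current state and action, the observed reward, and the next state and action.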

In [4]:
def sarsa(alpha, phi, eps, env, iterations):
    """Run tabular SARSA on ``env`` for ``iterations`` episodes.

    Args:
        alpha: step size applied to the TD error.
        phi: discount factor multiplying the next state-action value.
        eps: exploration parameter of the epsilon-soft policy.
        env: environment offering ``reset()``, ``step(action)``,
            ``state_space()`` and ``action_space()``.
        iterations: number of episodes to run.

    Returns:
        ``(policy, q)``: the last epsilon-soft policy built and the Q-function.
    """
    q = QFunction(env)

    def build_policy():
        # A fresh epsilon-soft policy over the current Q-table.
        return EpsSoftPolicyFromQ(
            q.q,
            state_space=env.state_space(),
            action_space=env.action_space(),
            eps=eps,
        )

    policy = build_policy()
    for _episode in range(iterations):
        state, finished = env.reset(), False
        action = policy(state)
        while not finished:
            next_state, reward, finished = env.step(action)
            current = q(state, action)
            # No next action and a bootstrap value of 0 once the episode ended.
            next_action = None if finished else policy(next_state)
            bootstrap = 0 if finished else q(next_state, next_action)

            td_error = reward + phi * bootstrap - current
            q.update(state, action, current + alpha * td_error)
            state, action = next_state, next_action
            policy = build_policy()
    return policy, q

# Q-learning (off-policy TD control)

In [5]:
def q_learning(alpha, phi, eps, env, iterations):
    """Run tabular Q-learning on ``env`` for ``iterations`` episodes.

    Actions are drawn from an epsilon-soft policy, while the update bootstraps
    from the maximum value stored for the next state.

    Args:
        alpha: step size applied to the TD error.
        phi: discount factor multiplying the next-state value.
        eps: exploration parameter of the epsilon-soft policy.
        env: environment offering ``reset()``, ``step(action)``,
            ``state_space()`` and ``action_space()``.
        iterations: number of episodes to run.

    Returns:
        ``(policy, q)``: the last epsilon-soft policy built and the Q-function.
    """
    q = QFunction(env)

    def build_policy():
        # A fresh epsilon-soft policy over the current Q-table.
        return EpsSoftPolicyFromQ(
            q.q,
            state_space=env.state_space(),
            action_space=env.action_space(),
            eps=eps,
        )

    policy = build_policy()
    for _episode in range(iterations):
        state, finished = env.reset(), False
        while not finished:
            action = policy(state)
            next_state, reward, finished = env.step(action)

            current = q(state, action)
            if finished:
                bootstrap = 0
            else:
                # Largest stored value among the rows of the next state.
                next_rows = q.q.loc[(q.q[S] == next_state)]
                bootstrap = next_rows[V].max()

            td_error = reward + phi * bootstrap - current
            q.update(state, action, current + alpha * td_error)
            state = next_state
            policy = build_policy()
    return policy, q

# Expected SARSA

In [16]:
def expected_sarsa(alpha, phi, eps, env, iterations):
    """Run tabular Expected SARSA on ``env`` for ``iterations`` episodes.

    The bootstrap term is the expectation of the next state's action values
    under the current epsilon-soft policy.

    Args:
        alpha: step size applied to the TD error.
        phi: discount factor multiplying the expected next-state value.
        eps: exploration parameter of the epsilon-soft policy.
        env: environment offering ``reset()``, ``step(action)``,
            ``state_space()`` and ``action_space()``.
        iterations: number of episodes to run.

    Returns:
        ``(policy, q)``: the last epsilon-soft policy built and the Q-function.
    """
    q = QFunction(env)
    policy = EpsSoftPolicyFromQ(q.q, state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    for _ in range(iterations):
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done = env.step(action)

            q_val = q(state, action)

            if done:
                q_val_next = 0
            else:
                q_next = q.q.loc[(q.q[S] == next_state)]
                q_val_next = 0
                for (next_action, value) in zip(q_next[A], q_next[V]):
                    # Weight each next-state value by the policy's probability
                    # of that action IN THE NEXT STATE (not the current one).
                    q_val_next += policy.p(next_action, next_state) * value

            q.update(state, action, q_val + alpha * (reward + phi * q_val_next - q_val))
            state = next_state
            policy = EpsSoftPolicyFromQ(q.q, state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    return policy, q

# Double Q-learning

In [37]:
def double_q_learning(alpha, phi, eps, env, iterations):
    """Run tabular Double Q-learning on ``env`` for ``iterations`` episodes.

    Two Q-functions are kept. On every step one of them is picked at random to
    be updated: it selects the greedy next action, and the other one supplies
    the value of that action. Behaviour actions come from an epsilon-soft
    policy over the sum of both tables.

    Args:
        alpha: step size applied to the TD error.
        phi: discount factor multiplying the next-state value.
        eps: exploration parameter of the epsilon-soft policy.
        env: environment offering ``reset()``, ``step(action)``,
            ``state_space()`` and ``action_space()``.
        iterations: number of episodes to run.

    Returns:
        ``(policy, qs)``: the last policy built and the list of both Q-functions.
    """
    qs = [QFunction(env), QFunction(env)]
    policy = EpsSoftPolicyFromQs([q.q for q in qs], state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    for _ in range(iterations):
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done = env.step(action)

            # q1 is the table being updated, q2 the one used for evaluation.
            q_index = random.randint(0, 1)
            q1, q2 = qs[q_index], qs[1 - q_index]
            q_val = q1(state, action)

            if done:
                q_val_next = 0
            else:
                next_state_rows = q1.q.loc[(q1.q[S] == next_state)]
                # idxmax() yields an index LABEL, so look the row up with the
                # label-based .loc; positional .iloc only agrees with it when
                # the frame has a default RangeIndex.
                best_label = next_state_rows[V].idxmax()
                next_action = q1.q.loc[best_label][A]
                q_val_next = q2(next_state, next_action)

            q1.update(state, action, q_val + alpha * (reward + phi * q_val_next - q_val))
            state = next_state
            policy = EpsSoftPolicyFromQs([q.q for q in qs], state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    return policy, qs

# Tests

In [25]:
# Seed the random number generators so that Restart & Run All reproduces the
# same runs. The policies in this notebook sample with the stdlib `random`
# module; numpy is seeded too in case rl_util draws from it (not verified).
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

env = simple_circle(state_space=10, action_space=2)
alpha = 0.1       # step size of the TD updates
phi = 0.99        # discount factor
eps = 0.5         # exploration parameter of the epsilon-soft policies
iterations = 100  # number of training episodes per algorithm

In [26]:
# Display the environment's transition table
# (columns: state, action, reward, next_state, probability).
env.transitions

Unnamed: 0,state,action,reward,next_state,probability
0,0.0,0.0,-3.0,1.0,1.0
1,0.0,1.0,-1.0,8.0,1.0
2,1.0,0.0,-2.0,2.0,1.0
3,1.0,1.0,-1.0,4.0,1.0
4,2.0,0.0,-2.0,3.0,1.0
5,2.0,1.0,-2.0,4.0,1.0
6,3.0,0.0,-3.0,4.0,1.0
7,3.0,1.0,-1.0,1.0,1.0
8,4.0,0.0,-2.0,5.0,1.0
9,4.0,1.0,-1.0,5.0,1.0


In [41]:
# Train with SARSA, then roll out the resulting policy on the environment.
policy, q = sarsa(alpha=alpha, phi=phi, eps=eps, env=env, iterations=iterations)
test_policy(env, policy)

Finished in 2 steps, reward: -2.0


[0, 8, 9]

In [42]:
# Train with Q-learning, then roll out the resulting policy on the environment.
policy, q = q_learning(alpha=alpha, phi=phi, eps=eps, env=env, iterations=iterations)
test_policy(env, policy)

Finished in 7 steps, reward: -13.0


[0, 1, 2, 3, 1, 4, 5, 9]

In [43]:
# Train with Expected SARSA, then roll out the resulting policy on the environment.
policy, q = expected_sarsa(alpha=alpha, phi=phi, eps=eps, env=env, iterations=iterations)
test_policy(env, policy)

Finished in 5 steps, reward: -11.0


[0, 1, 2, 4, 5, 9]

In [40]:
# Train with Double Q-learning, then roll out the resulting policy on the environment.
policy, qs = double_q_learning(alpha=alpha, phi=phi, eps=eps, env=env, iterations=iterations)
test_policy(env, policy)

Finished in 2 steps, reward: -2.0


[0, 8, 9]