In [2]:
!pip3 install rl_util


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2[0m[39;49m -> [0m[32;49m22.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.9 -m pip install --upgrade pip[0m


In [3]:
import pandas as pd
import random
from queue import PriorityQueue
from rl_util.test import test_policy
from rl_util.value import QFunction
from rl_util.environment import MarkovEnv
from rl_util.policy import EpsSoftPolicyFromQ, GreedyPolicyFromQ
from rl_util.generator import simple_circle
import numpy as np

S = 'state'
A = 'action'
R = 'reward'
V = 'value'
G = 'return'
NS = 'next_step'

In [4]:
# Assume the environment is deterministic
class Model: 
    def __init__(self):
        self.t = pd.DataFrame()
    
    def add(self, state, action, reward, next_state):
        self.t = self.t.append({S: state, A: action, R: reward, NS: next_state}, ignore_index=True)
        
    def __from_pd(self, sample):
        if len(sample) == 0:
            return [None] * 4
        return (sample[S].values[0], sample[A].values[0], sample[R].values[0], sample[NS].values[0])
    
    def sample(self):
        return self.__from_pd(self.t.sample())
    
    def next_state(self, s, a):
        return self.__from_pd(self.t.loc[(self.t[S] == s) & (self.t[A] == a)])
    
    def prev_state(self, s, a):
        return self.__from_pd(self.t.loc[(self.t[NS] == s) & (self.t[A] == a)])
    
    def __len__(self):
        return self.t.shape[0]

# Dyna-Q

In [5]:
def dyna_q(n, alpha, phi, eps, env, iterations):
    q = QFunction(env)
    policy = EpsSoftPolicyFromQ(q.q, state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    model = Model()
    for _ in range(iterations):
        state = env.reset()
        action = policy(state)
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done = env.step(action)
            model.add(state, action, reward, next_state)
            if done:
                q_val_next = 0
            else:
                q_val_next = q.get_max(next_state)
            q.update(state, action, q(state, action) + alpha * (reward + phi * q_val_next - q(state, action)))
            
            state = next_state
            
            if len(model) < n:
                continue
            
            for _ in range(n):
                m_state, m_action, m_reward, m_next_state = model.sample()
                q_val_next = q.get_max(m_next_state)
                q.update(state, action, q(m_state, m_action) + alpha * (m_reward + phi * q_val_next - q(m_state, m_action)))
            
            policy = EpsSoftPolicyFromQ(q.q, state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    return GreedyPolicyFromQ(q.q, state_space=env.state_space(), action_space=env.action_space()), q

# Prioritized sweeping for deterministic environment

In [6]:
def prioritized_sweeping(n, theta, alpha, phi, eps, env, iterations):
    p_queue = PriorityQueue()
    q = QFunction(env)
    policy = EpsSoftPolicyFromQ(q.q, state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    model = Model()
    for _ in range(iterations):
        state = env.reset()
        action = policy(state)
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done = env.step(action)
            model.add(state, action, reward, next_state)
            if done:
                q_val_next = 0
            else:
                q_val_next = q.q.loc[(q.q[S] == next_state)][V].max()
            
            p = abs(reward + phi * q_val_next - q(state, action))
            if p > theta:
                p_queue.put((p, state, action, reward, next_state))
            
            state = next_state

            for _ in range(n):
                if p_queue.empty():
                    break
                _, m_state, m_action, m_reward, m_next_state = p_queue.get()
                q_val_next = q.get_max(m_next_state)
                q.update(m_state, m_action, q(m_state, m_action) + alpha * (m_reward + phi * q_val_next - q(m_state, m_action)))
                for a in range(env.action_space()):
                    m_prev_state, m_action, m_reward, m_state = model.prev_state(m_state, a)
                    if m_prev_state is None:
                        continue
                    q_val_next = q.q.loc[(q.q[S] == m_state)][V].max()
                    q_val = q(m_prev_state, m_action)
                    p = abs(m_reward + phi * q_val_next - q_val)
                    if p > theta:
                        p_queue.put((p, state, action, reward, next_state))
            policy = EpsSoftPolicyFromQ(q.q, state_space=env.state_space(), action_space=env.action_space(), eps=eps)
    return GreedyPolicyFromQ(q.q, state_space=env.state_space(), action_space=env.action_space()), q

# Testing

In [7]:
env = simple_circle(state_space=5, action_space=2)
alpha = 0.5
phi = 0.99
eps = 0.5
iterations = 10
n = 5
theta = 0.5

In [8]:
env.transitions

Unnamed: 0,state,action,reward,next_state,probability
0,0.0,0.0,-3.0,1.0,1.0
1,0.0,1.0,-1.0,3.0,1.0
2,1.0,0.0,-1.0,2.0,1.0
3,1.0,1.0,-2.0,3.0,1.0
4,2.0,0.0,-3.0,3.0,1.0
5,2.0,1.0,-1.0,0.0,1.0
6,3.0,0.0,-2.0,4.0,1.0
7,3.0,1.0,-1.0,3.0,1.0


In [11]:
# Dyna-Q
dyna_policy, d_q = dyna_q(n, alpha, phi, eps, env, iterations)
test_policy(env, dyna_policy)

Finished in 2 steps, reward: -3.0


([0, 3, 4], -3.0, 2)

In [12]:
d_q.q

Unnamed: 0,state,action,value
0,0,0,-4.433679
1,0,1,-2.879243
2,1,0,-1.421435
3,1,1,-2.852941
4,2,0,-2.477914
5,2,1,-1.748909
6,3,0,-1.996963
7,3,1,-3.413464


In [13]:
# Prioritized sweeping
ps_policy, ps_q = prioritized_sweeping(n, theta, alpha, phi, eps, env, iterations)
test_policy(env, ps_policy)

Finished in 2 steps, reward: -3.0


([0, 3, 4], -3.0, 2)

In [15]:
ps_q.q

Unnamed: 0,state,action,value
0,0,0,-5.280193
1,0,1,-3.12
2,1,0,-6.642325
3,1,1,-4.016596
4,2,0,-1.301534
5,2,1,-1.248691
6,3,0,-2.458762
7,3,1,-3.494725
