In [130]:
import numpy as np

class Environment:
    """Tabular Markov decision process with a fixed number of steps per episode.

    Parameters
    ----------
    transition_probs : array of shape (num_states, num_actions, num_states)
        transition_probs[s, a, s2] is the (possibly unnormalised) weight of
        moving to state s2 after taking action a in state s. Each row is
        normalised to sum to one when a step is sampled.
    reward_func : array of shape (num_states, num_actions)
        reward_func[s, a] is the reward returned for taking action a in state s.
    initial_state : int
        State every episode starts from.
    steps : int
        Number of steps after which `step` reports `done=True`.
    verbose : bool
        When True (the default) every step prints the transition and reward.
        Pass False to keep long rollouts from flooding the output.
    """

    def __init__(self, transition_probs, reward_func, initial_state=0, steps=3, verbose=True):
        # Transition probabilities: _transition_probs[s, a, s2] = P(next_state=s2 | state=s, action=a)
        assert len(transition_probs.shape) == 3
        self._transition_probs = transition_probs
        # Rewards: _reward_func[s, a] = E[r | state=s, action=a]
        assert len(reward_func.shape) == 2
        self._reward_func = reward_func

        self._num_states = transition_probs.shape[0]
        self._num_actions = transition_probs.shape[1]
        assert transition_probs.shape[2] == self._num_states
        assert reward_func.shape[0] == self._num_states and reward_func.shape[1] == self._num_actions

        # The lower bound matters: a negative index would silently wrap around
        # to the last state instead of being rejected.
        assert 0 <= initial_state < self._num_states
        self._initial_state = initial_state
        self._steps = steps
        self._verbose = verbose
        self.reset()

    def reset(self):
        """Start a new episode and return the initial state."""
        self._state = self._initial_state
        self._t = 0
        return self._state

    def step(self, action):
        """Apply `action` in the current state.

        Returns
        -------
        (next_state, reward, done)
            The sampled next state, the reward for the (state, action) pair that
            was just left, and whether the step budget has been used up.

        Raises
        ------
        AssertionError
            If `action` is outside [0, num_actions).
        ValueError
            If the transition weights for (state, action) do not have a
            positive sum, so no next state can be sampled.
        """
        # Reject negative actions too; they would otherwise index from the end.
        assert 0 <= action < self._num_actions
        probs = self._transition_probs[self._state, action]
        total = probs.sum()
        if total <= 0:
            # Dividing by this sum would produce NaNs (or flip signs) and only
            # fail later, inside the sampler, with an unhelpful message.
            raise ValueError(
                'transition weights for state {} and action {} must have a positive sum'.format(
                    self._state, action))
        next_state = np.random.choice(self._num_states, p=probs / total)
        reward = self._reward_func[self._state, action]
        if self._verbose:
            print('State: {} -> {}'.format(self._state, next_state))
            print('Reward: {}'.format(reward))
        self._state = next_state
        self._t += 1
        done = self._t >= self._steps
        return next_state, reward, done
    
# "Take a loss now to gain later" task: action 0 costs -1 in states 0 and 1 but
# leads to state 2, where repeating action 0 pays +5 on every step.
# T[s, a] is the next-state distribution for taking action a in state s.
T = np.array([
    [[0., 1., 0.], [1., 0., 0.]],   # state 0: action 0 -> state 1, action 1 -> state 0
    [[0., 0., 1.], [1., 0., 0.]],   # state 1: action 0 -> state 2, action 1 -> state 0
    [[0., 0., 1.], [0., 1., 0.]],   # state 2: action 0 -> state 2, action 1 -> state 1
])
# R[s, a] is the reward for taking action a in state s.
R = np.array([
    [-1., +1.],
    [-1., +1.],
    [+5., +1.],
])

env = Environment(T, R)

In [118]:
class Agent:
    """Agent that samples actions from a fixed, state-dependent stochastic policy."""

    def __init__(self, policy):
        # Action-selection probabilities: _policy[s, a] = P(action=a | state=s)
        assert len(policy.shape) == 2
        self._policy = policy
        self._num_states, self._num_actions = policy.shape

    def select_action(self, state):
        """Draw one action index according to the policy row for `state`."""
        weights = self._policy[state]
        # Normalise so that rows need not sum exactly to one.
        return np.random.choice(self._num_actions, p=weights / weights.sum())
    
# Random behaviour: both actions equally likely in every state.
P = np.array([
    [.5, .5],
    [.5, .5],
    [.5, .5],
])
agent0 = Agent(P)

# Immediate profit: always take action 1.
P = np.array([
    [0., 1.],
    [0., 1.],
    [0., 1.],
])
agent1 = Agent(P)

# "Spirit of service": always take action 0.
P = np.array([
    [1., 0.],
    [1., 0.],
    [1., 0.],
])
agent2 = Agent(P)

In [125]:
# Sanity check: agent1's policy puts probability 1 on action 1 in every state,
# so this returns 1.
agent1.select_action(0)

1

In [136]:
# Roll out one episode and accumulate the discounted return
#   ret = sum_t gamma**t * r_t
env = Environment(T, R, steps=1000)
agent = agent0
gamma = 0.9       # discount factor
gamma_prod = 1.   # running discount, gamma**t

state = env.reset()
ret = 0
while True:
    action = agent.select_action(state)
    next_state, reward, done = env.step(action)
    ret += gamma_prod * reward
    gamma_prod *= gamma
    # Advance the observation: without this the agent would keep choosing
    # actions for the initial state no matter where the environment is.
    state = next_state
    if done:
        break

print(ret)

State: 0 -> 1
Reward: -1.0
State: 1 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 1
Reward: -1.0
State: 1 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 1
Reward: -1.0
State: 1 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 1
Reward: -1.0
State: 1 -> 2
Reward: -1.0
State: 2 -> 1
Reward: 1.0
State: 1 -> 2
Reward: -1.0
State: 2 -> 1
Reward: 1.0
State: 1 -> 2
Reward: -1.0
State: 2 -> 2
Reward: 5.0
State: 2 -> 1
Reward: 1.0
State: 1 -> 2
Reward: -1.0
State: 2 -> 1
Reward: 1.0
State: 1 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 1
Reward: -1.0
State: 1 -> 0
Reward: 1.0
State: 0 -> 1
Reward: -1.0
State: 1 -> 2
Reward: -1.0
State: 2 -> 2
Reward: 5.0
State: 2 -> 1
Reward: 1.0
State: 1 -> 0
Reward: 1.0
State: 0 -> 0
Reward: 1.0
State: 0 -> 1
Reward: -1.0
State: 1 -> 2
Reward: -1.0
State: 2 -> 2
Reward: 5.0
State: 2 -> 2
Reward: 5.0

In [135]:
# Closed-form discounted return for always taking action 0 from state 0 over an
# infinite horizon: reward -1 (0 -> 1), then -1 (1 -> 2), then +5 on every
# following step in state 2, i.e. -1 - gamma + 5 * gamma**2 / (1 - gamma).
-1-gamma+5*gamma**2/(1-gamma)

38.600000000000016