In [51]:
import numpy as np
import random

In [52]:
Node_count = 15
Normal_reward = -.02


In [53]:
class Environment():
    def __init__(self):
        self.state_value = np.full((Node_count + 1), 0)
        self.state_value_count = np.full((Node_count+1), 0)
        
    def reset(self):
        self.poisition = int((1 + Node_count) / 2)
        self.trajectory = [self.poisition]
    
    def get_states(self):
        return [i for i in range(1, Node_count + 1)]
    
    def get_actions(self):
        return [-1, 1]
    
    def is_done(self):
        return self.poisition == 1 or self.poisition == Node_count
    
    def step(self, action):
        if self.is_done():
            raise Exception('Game over')
            
        self.poisition += action
        self.trajectory.append(self.poisition)
        if self.poisition == 1:
            reward = -1
        elif self.poisition == Node_count:
            reward = 1
        else:
            reward = Normal_reward
        return self.poisition, reward
    
    def update_state_value(self, final_value):
        for i in range(len(self.trajectory) - 1, -1, -1):
            final_value += Normal_reward
            self.state_value[self.trajectory[i]] += final_value
            self.state_value_count[self.trajectory[i]] += 1
            
    def get_observation(self):
        mean1 = np.full((Node_count + 1), 0)
        for i in range(1, Node_count + 1):
            if self.state_value_count[i] == 0:
                mean1[i] = 0
                
            else:
                mean1[i] = self.state_value[i] / self.state_value_count[i]
        return mean1
            

In [54]:
class Agent():
    def __init__(self):
        pass
    
    def action(self, env):
        state_value = env.get_observation()
        
        if state_value[env.poisition - 1] > state_value[env.poisition + 1]:
            next_action = -1
        elif state_value[env.poisition-1] < state_value[env.poisition+1]: 
            next_action = 1
        else:
            next_action = random.choice(env.get_actions())
            
        return next_action

In [55]:
env = Environment()
agent = Agent()
total_reward_list = []
for i in range(10):
    env.reset()
    total_reward = 0
    action_count = 0
    while not env.is_done():
        action = agent.action(env)
        action_count += 1
        state, reward = env.step(action)
        
        total_reward += reward
        
        if action_count > 100:
            env.poision = int((1 + Node_count) / 2)
            break
            
    print(f'trajectory {i}: {env.trajectory}')
    
    if action_count <= 100:
        env.update_state_value(total_reward)
    total_reward_list.append(round(total_reward, 2))   
print(f"累計報酬: {total_reward_list}")
    

trajectory 0: [8, 9, 10, 9, 8, 9, 10, 9, 10, 11, 12, 11, 10, 9, 10, 9, 8, 9, 8, 7, 6, 7, 8, 7, 8, 9, 10, 11, 12, 13, 14, 15]
trajectory 1: [8, 9, 8, 9, 8, 9, 8, 9, 8, 7, 6, 7, 8, 9, 8, 9, 10, 9, 8, 7, 6, 7, 8, 9, 10, 11, 12, 11, 10, 11, 12, 13, 12, 13, 14, 13, 14, 13, 12, 13, 14, 13, 12, 11, 10, 9, 10, 11, 10, 11, 12, 13, 12, 11, 12, 13, 14, 13, 14, 15]
trajectory 2: [8, 9, 10, 9, 10, 9, 8, 7, 8, 7, 6, 5, 4, 5, 4, 3, 4, 3, 2, 1]
trajectory 3: [8, 9, 10, 11, 10, 11, 12, 13, 14, 13, 14, 13, 12, 11, 12, 11, 10, 9, 8, 7, 6, 7, 6, 7, 6, 7, 8, 9, 10, 9, 8, 9, 10, 9, 10, 11, 12, 13, 14, 13, 12, 13, 12, 11, 10, 11, 10, 9, 10, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 13, 12, 13, 14, 15]
trajectory 4: [8, 9, 10, 11, 12, 13, 14, 15]
trajectory 5: [8, 7, 8, 9, 10, 9, 8, 7, 8, 9, 8, 7, 8, 9, 10, 9, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 10, 11, 12, 13, 12, 11, 10, 11, 10, 11, 10, 9, 8, 7, 8, 7, 6, 7, 8, 9, 10, 11, 10, 11, 12, 11, 10, 9, 8, 7, 8, 9, 10, 11, 10, 11, 12, 13, 14, 13, 14, 13, 12,

In [56]:
env.get_observation()[1:]


array([-1, -1, -1, -1, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0])