## 간단한 미로 문제 설정

In [4]:
import random

# 이산 행동/관찰 공간
class Discrete:
    """Finite space made of the integers ``0`` through ``n - 1``.

    Used for both the action space and the observation space.
    """

    def __init__(self, num_actions: int):
        # Number of distinct values contained in the space.
        self.n = num_actions

    def sample(self):
        """Return one randomly drawn member of the space."""
        return random.randrange(self.n)
# Smoke test: build a 4-value space and print a single random draw from it.
space = Discrete(4)
print(space.sample())

3


In [20]:
import os

# 환경 : agent, 목표, 행동 공간, 관찰 공간
class Environment:
    """5x5 grid world in which a seeker walks from (0, 0) to the goal at (4, 4).

    Positions are ``(row, column)`` tuples. Actions are the integers 0-3
    (down, left, up, right) and observations are the integers 0-24.
    """

    def __init__(self, *args, **kwargs):
        self.seeker, self.goal = (0, 0), (4, 4)
        # `info` is handed back by every `step` call; `step` and `reset`
        # keep its 'seeker' entry in sync with the current position.
        self.info = {'seeker': self.seeker, 'goal': self.goal}
        self.action_space = Discrete(4)
        self.observation_space = Discrete(5*5)

    def reset(self):
        """Move the seeker back to the start cell and return the observation."""
        self.seeker = (0, 0)
        # Tuples are immutable, so the dict entry has to be rebound as well.
        self.info['seeker'] = self.seeker
        return self.get_observation()

    def get_observation(self):
        """Encode the seeker position as the integer ``5 * row + column``."""
        return 5*self.seeker[0] + self.seeker[1]

    def get_reward(self):
        """Return 1 when the seeker stands on the goal, otherwise 0."""
        return 1 if self.seeker == self.goal else 0

    def is_done(self):
        """Return True once the seeker has reached the goal."""
        return self.seeker == self.goal

    def step(self, action):
        """Move one cell in the given direction and report the outcome.

        Moves that would leave the grid are clamped, so the seeker stays put.

        Args:
            action: 0 (down), 1 (left), 2 (up) or 3 (right).

        Returns:
            A tuple ``(observation, reward, done, info)``.

        Raises:
            ValueError: If ``action`` is not one of the four known actions.
        """
        if action == 0:  # move down
            self.seeker = (min(self.seeker[0] + 1, 4), self.seeker[1])
        elif action == 1:  # move left
            self.seeker = (self.seeker[0], max(self.seeker[1] - 1, 0))
        elif action == 2:  # move up
            self.seeker = (max(self.seeker[0] - 1, 0), self.seeker[1])
        elif action == 3:  # move right
            self.seeker = (self.seeker[0], min(self.seeker[1] + 1, 4))
        else:
            raise ValueError("Invalid action")

        # Without this refresh `info` would keep reporting the start position,
        # because it was filled in once from the initial seeker tuple.
        self.info['seeker'] = self.seeker

        obs = self.get_observation()
        rew = self.get_reward()
        done = self.is_done()
        return obs, rew, done, self.info

    def render(self, *args, **kwargs):
        """Clear the terminal and print the grid with the seeker (S) and goal (G)."""
        # 'clear' only exists on POSIX shells; Windows uses 'cls'.
        os.system('cls' if os.name == 'nt' else 'clear')
        grid = [['| ' for _ in range(5)] + ['|\n'] for _ in range(5)]
        grid[self.goal[0]][self.goal[1]] = '|G'
        # Drawn after the goal, so the seeker hides 'G' once it arrives.
        grid[self.seeker[0]][self.seeker[1]] = '|S'
        print(''.join([''.join(grid_row) for grid_row in grid]))

In [21]:
import time
environment = Environment()

# Drive the seeker with randomly sampled actions until it reaches the goal,
# pausing one second and redrawing the grid after every step.
while not environment.is_done():
    random_action = environment.action_space.sample()
    environment.step(random_action)
    time.sleep(1)
    environment.render()

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| |S| | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| | | | | |
| |S| | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| |S| | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J|S| | | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|

[H[2J| |S| | | |
| | | | | |
| | | | | |
| | | | | |
| | | | |G|



KeyboardInterrupt: 

## 시뮬레이션 구현

In [None]:


import numpy as np

class Policy:
    """Tabular policy holding one value per (state, action) pair."""

    def __init__(self, env):
        n_states = env.observation_space.n
        # One independent zero-filled row per state, one column per action.
        self.state_action_table = [
            [0] * env.action_space.n for _ in range(n_states)
        ]
        self.action_space = env.action_space

    def get_action(self, state, explore=True, epsilon=0.1):
        """Pick an action for ``state``.

        When ``explore`` is set, a random action is sampled whenever a random
        number drawn from [0, 1] is below ``epsilon``; otherwise the index of
        the largest value in the state's row is returned.
        """
        if explore:
            roll = random.uniform(0, 1)
            if roll < epsilon:
                return self.action_space.sample()
        action_values = self.state_action_table[state]
        return np.argmax(action_values)

In [27]:
class Simulation:
    """Runs a policy inside an environment and records what happened."""

    def __init__(self, env):
        # Environment that every rollout of this simulation acts on.
        self.env = env

    def rollout(self, policy, render=False, explore=True, epsilon=0.1):
        """Play one episode and return the collected experiences.

        Each experience is the list ``[state, action, reward, next_state]``.
        The episode ends when the environment reports ``done``.
        """
        trajectory = []
        current = self.env.reset()
        while True:
            chosen = policy.get_action(current, explore, epsilon)
            following, reward, finished, _info = self.env.step(chosen)
            trajectory.append([current, chosen, reward, following])
            current = following
            if render:
                time.sleep(1)
                self.env.render()
            if finished:
                return trajectory

In [29]:
# Roll out a freshly created policy once with epsilon=1.0, then print its table.
# A rollout only collects experiences and never writes to the policy, so every
# row is still all zeros.
untrained_policy = Policy(environment)
sim = Simulation(environment)
exp = sim.rollout(untrained_policy, render=False, epsilon=1.0)
for row in untrained_policy.state_action_table:
    print(row)

[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]


## 강화학습 모델 훈련

In [32]:
def update_policy(policy, experiences, weight=0.1, discount_factor=0.9):
    """Update the policy's value table in place from a list of experiences.

    For each ``(state, action, reward, next_state)`` the stored value is
    blended with ``reward + discount_factor * max(values of next_state)``,
    with ``weight`` as the share given to the new estimate.
    """
    table = policy.state_action_table
    for state, action, reward, next_state in experiences:
        best_next = np.max(table[next_state])
        current = table[state][action]
        target = reward + discount_factor * best_next
        table[state][action] = (1 - weight) * current + weight * target

In [35]:
def train_policy(env, num_episodes=10000, weight=0.1, discount_factor=0.9):
    """Train a new policy on ``env`` and return it.

    Each of the ``num_episodes`` iterations rolls out the current policy once
    and feeds the resulting experiences to ``update_policy``.
    """
    policy = Policy(env)
    simulation = Simulation(env)
    for _episode in range(num_episodes):
        update_policy(policy, simulation.rollout(policy), weight, discount_factor)
    return policy
# Train with the default settings and print the learned table, one row per state.
trained_policy = train_policy(environment)
for row in trained_policy.state_action_table:
    print(row)

[0.4782968999999985, 0.4304672099999939, 0.43046720999996124, 0.38742048898349213]
[0.14753450961534276, 0.430467209999325, 0.2020971018157964, 0.012928799347924402]
[0.03045146291083148, 0.10499094828036047, 0, 0]
[0, 0, 0, 0]
[0, 0, 0, 0]
[0.5314409999999985, 0.47829689999992375, 0.4304672099999779, 0.5314409997675092]
[0.5904899999932254, 0.1958673635189995, 0.130143164164502, 0.059591141268447015]
[0.2957353091417675, 0, 0, 0.0]
[0, 0.023409017042862496, 0, 0]
[0, 0, 0, 0]
[0.5904899999999987, 0.5314409999996254, 0.4782968999977702, 0.5904899999986886]
[0.6560999999999991, 0.4329639097816381, 0.4376382523034612, 0.29399818132012334]
[0.693316248418272, 0.05904899999986285, 0, 0.0]
[0.5264436831214581, 0, 0, 0.0]
[0.0171, 0, 0, 0]
[0.6560999999999991, 0.590489999999807, 0.5314409999975317, 0.6560999999983582]
[0.7289999999999993, 0.537804865951427, 0.5370006555082681, 0.6899362522957431]
[0.8099999999999605, 0.3736704147584959, 0.35054094468874525, 0.5810332416882418]
[0.89999999999

In [42]:
def evaluate_policy(env, policy, num_episodes=10):
    """Roll out ``policy`` without exploration and report the mean episode length.

    Prints a summary line and returns the average number of steps per episode.
    """
    simulation = Simulation(env)
    total_steps = sum(
        len(simulation.rollout(policy, render=False, explore=False))
        for _ in range(num_episodes)
    )
    average = total_steps / num_episodes

    print(f"{average} steps on average "
          f"for a total of {num_episodes} episodes.")

    return average


# Evaluate the trained policy over the default 10 exploration-free episodes.
evaluate_policy(environment, trained_policy)

8.0 steps on average for a total of 10 episodes.


8.0

## 레이 분산 애플리케이션 구축

In [43]:
import ray
ray.init()

2024-03-08 06:02:04,279	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


0,1
Python version:,3.10.13
Ray version:,2.2.0
Dashboard:,http://127.0.0.1:8266


In [47]:
@ray.remote
class SimulationActor(Simulation):
    """Ray actor wrapping a ``Simulation`` that owns its own ``Environment``."""

    def __init__(self):
        # The environment is created inside the actor rather than passed in.
        super().__init__(Environment())

def train_policy_parallel(env, num_episodes=100, num_simulations=4):
    """Train a policy using several simulation actors running in parallel.

    Args:
        env: Environment used to size the policy table. Each actor creates
            its own ``Environment`` for its rollouts.
        num_episodes: Number of rounds; every round starts one rollout per actor.
        num_simulations: Number of ``SimulationActor`` instances to create.

    Returns:
        The trained policy.
    """
    policy = Policy(env)
    simulations = [SimulationActor.remote() for _ in range(num_simulations)]

    for _ in range(num_episodes):
        # Re-publish the policy every round: an object placed in the object
        # store is a snapshot, so a reference created once before the loop
        # would make every rollout use the initial, untrained policy.
        policy_ref = ray.put(policy)
        pending = [sim.rollout.remote(policy_ref) for sim in simulations]
        while pending:
            # Apply each rollout's experiences as soon as it finishes.
            finished, pending = ray.wait(pending)
            for exp in ray.get(finished):
                update_policy(policy, exp)
    return policy

# Train with the parallel actors, then evaluate the result the same way as
# the serially trained policy.
parallel_policy = train_policy_parallel(environment)
evaluate_policy(environment, parallel_policy)

8.0 steps on average for a total of 10 episodes.


8.0