### 카트폴 게임 마스터하기

In [None]:
import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt
from pyvirtualdisplay import Display

In [None]:
!apt-get install -y xvfb x11-utils

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libxxf86dga1
Suggested packages:
  mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 x11-utils xvfb
0 upgraded, 3 newly installed, 0 to remove and 41 not upgraded.
Need to get 993 kB of archives.
After this operation, 2,982 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.10 [784 kB]
Fetched 993 kB in 1s (1,519 kB/s)
Selecting previously unselected package libxxf86dga1:amd64.
(Reading database ... 155514 files and directories currently installed.)
Preparing to unpack .../libxxf86dga1_2%3a1.1.4-1_amd64.deb ...
Unpacking libxxf86dga1:amd64 (2

In [None]:
# Create a non-visible 400x300 virtual display via pyvirtualdisplay and start it.
# NOTE(review): presumably needed so env.render() works without a physical
# screen (e.g. on a hosted notebook) - confirm against the runtime used.
display = Display(visible=False, size=(400, 300))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '400x300x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '400x300x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [None]:
# Hyperparameters for the DQN training run.
EPISODES = 50       # Number of episodes to run
EPS_START = 0.9     # Probability that the agent acts randomly at the start of training
EPS_END = 0.05      # Probability that the agent acts randomly toward the end of training
EPS_DECAY = 200     # Decay constant: larger values make the random-action probability fall more slowly
GAMMA = 0.8         # Discount factor
LR = 0.001          # Learning rate passed to the Adam optimizer
BATCH_SIZE = 64     # Number of stored transitions sampled per learning step

In [None]:
class DQNAgent:
    """Epsilon-greedy DQN agent for a 4-feature, 2-action environment.

    The agent owns a small fully connected Q-network, an Adam optimizer and
    a bounded replay memory of ``(state, action, reward, next_state)``
    tensors.

    All constructor arguments are optional and keyword-only. Any argument
    left as ``None`` falls back to the module-level constant of the same
    (upper-case) name: ``LR``, ``BATCH_SIZE``, ``GAMMA``, ``EPS_START``,
    ``EPS_END``, ``EPS_DECAY``. ``DQNAgent()`` therefore behaves as before.
    """

    def __init__(self, *, lr=None, batch_size=None, gamma=None,
                 eps_start=None, eps_end=None, eps_decay=None):
        # Resolve hyperparameters lazily so that explicit arguments never
        # touch the module-level constants.
        self.lr = LR if lr is None else lr
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.gamma = GAMMA if gamma is None else gamma
        self.eps_start = EPS_START if eps_start is None else eps_start
        self.eps_end = EPS_END if eps_end is None else eps_end
        self.eps_decay = EPS_DECAY if eps_decay is None else eps_decay

        self.model = nn.Sequential(
            nn.Linear(4, 256),  # input: cart (position, velocity), pole (angle, velocity)
            nn.ReLU(),
            nn.Linear(256, 2),  # output: one Q-value per action (left, right)
        )
        self.optimizer = optim.Adam(self.model.parameters(), self.lr)
        self.steps_done = 0  # incremented on every call to act()
        # Replay memory: [(state, action, reward, next_state), ...]
        self.memory = deque(maxlen=10000)

    def memorize(self, state, action, reward, next_state):
        """Store one transition in the replay memory.

        Args:
            state: tensor of shape (1, 4), stored as given.
            action: tensor of shape (1, 1) holding the action index,
                stored as given.
            reward: scalar number; stored as a float tensor of shape (1,).
            next_state: sequence or array of 4 numbers; stored as a float
                tensor of shape (1, 4).
        """
        self.memory.append((
            state,
            action,
            torch.tensor([reward], dtype=torch.float32),
            # torch.tensor copies the data and avoids the slow
            # list-of-ndarray path of torch.FloatTensor([array]).
            torch.tensor(next_state, dtype=torch.float32).unsqueeze(0),
        ))

    def act(self, state):
        """Choose an action epsilon-greedily.

        The exploration probability decays exponentially from ``eps_start``
        towards ``eps_end`` as ``steps_done`` grows.

        Args:
            state: float tensor of shape (1, 4).

        Returns:
            A long tensor of shape (1, 1) containing the action index.
        """
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * math.exp(
            -1. * self.steps_done / self.eps_decay
        )
        self.steps_done += 1
        if random.random() > eps_threshold:
            # Greedy action; no gradient is needed for action selection.
            with torch.no_grad():
                return self.model(state).max(1)[1].view(1, 1)
        return torch.LongTensor([[random.randrange(2)]])

    def learn(self):
        """Run one optimization step on a random minibatch from memory.

        Does nothing until at least ``batch_size`` transitions are stored.
        """
        if len(self.memory) < self.batch_size:
            return
        # Sample uniformly at random to reduce correlation between the
        # experiences used in one update.
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states = (
            torch.cat(column) for column in zip(*batch)
        )

        # Q(s, a) for the action that was actually taken: shape (B,).
        # Without the gather the prediction is (B, 2) and cannot be compared
        # with the (B,) target.
        current_q = self.model(states).gather(1, actions).squeeze(1)

        # Bootstrapped target; it must not receive gradients.
        with torch.no_grad():
            max_next_q = self.model(next_states).max(1)[0]
        expected_q = rewards + (self.gamma * max_next_q)

        loss = F.mse_loss(current_q, expected_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

$\epsilon_{\text{threshold}} = \epsilon_{\text{end}} + (\epsilon_{\text{start}} - \epsilon_{\text{end}}) \cdot e^{-\text{step} / \epsilon_{\text{decay}}}$

In [None]:
# Train a fresh agent on CartPole for EPISODES episodes and record, per
# episode, how many steps the pole stayed up.
env = gym.make("CartPole-v0")

agent = DQNAgent()
score_history = []

for e in range(1, EPISODES + 1):
    state = env.reset()
    steps = 0
    done = False

    # Step the environment until it reports the episode is over.
    while not done:
        env.render()
        state = torch.FloatTensor([state])  # add a batch dimension
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action.item())

        # The step that ends the episode is stored with a reward of -1.
        reward = -1 if done else reward

        agent.memorize(state, action, reward, next_state)
        agent.learn()

        state = next_state
        steps += 1

    print(f"에피소드: {e}, 점수: {steps}")
    score_history.append(steps)