# STEP 4 - Making DRL PySC2 Agent

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.append('..')

In [3]:
### unfortunately, PySC2 uses Abseil, which treats python code as if its run like an app
# This does not play well with jupyter notebook
# So we will need to monkeypatch sys.argv


import sys
#sys.argv = ["python", "--map", "AbyssalReef"]
sys.argv = ["python", "--map", "Simple64"]

## 0. Runnning 'Agent code' on jupyter notebook 

In [4]:


# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run an agent."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import importlib
import threading

from absl import app
from absl import flags
from future.builtins import range  # pylint: disable=redefined-builtin

from pysc2 import maps
from pysc2.env import available_actions_printer
from pysc2.env import run_loop
from pysc2.env import sc2_env
from pysc2.lib import point_flag
from pysc2.lib import stopwatch
from pysc2.lib import actions

FLAGS = flags.FLAGS

# because of Abseil's horrible design for running code underneath Colabs
# We have to pull out this ugly hack from the hat
if "flags_defined" not in globals():
    flags.DEFINE_bool("render", False, "Whether to render with pygame.")
    point_flag.DEFINE_point("feature_screen_size", "84",
                            "Resolution for screen feature layers.")
    point_flag.DEFINE_point("feature_minimap_size", "64",
                            "Resolution for minimap feature layers.")
    point_flag.DEFINE_point("rgb_screen_size", None,
                            "Resolution for rendered screen.")
    point_flag.DEFINE_point("rgb_minimap_size", None,
                            "Resolution for rendered minimap.")
    flags.DEFINE_enum("action_space", "RAW", sc2_env.ActionSpace._member_names_,  # pylint: disable=protected-access
                      "Which action space to use. Needed if you take both feature "
                      "and rgb observations.")
    flags.DEFINE_bool("use_feature_units", False,
                      "Whether to include feature units.")
    flags.DEFINE_bool("use_raw_units", True,
                      "Whether to include raw units.")
    flags.DEFINE_integer("raw_resolution", 64, "Raw Resolution.")
    flags.DEFINE_bool("disable_fog", True, "Whether to disable Fog of War.")

    flags.DEFINE_integer("max_agent_steps", 0, "Total agent steps.")
    flags.DEFINE_integer("game_steps_per_episode", None, "Game steps per episode.")
    flags.DEFINE_integer("max_episodes", 0, "Total episodes.")
    flags.DEFINE_integer("step_mul", 8, "Game steps per agent step.")
    flags.DEFINE_float("fps", 22.4, "Frames per second to run the game.")

    #flags.DEFINE_string("agent", "sc2.agent.BasicAgent.ZergBasicAgent",
    #                    "Which agent to run, as a python path to an Agent class.")
    #flags.DEFINE_enum("agent_race", "zerg", sc2_env.Race._member_names_,  # pylint: disable=protected-access
    #                  "Agent 1's race.")
    flags.DEFINE_string("agent", "TerranRLAgentWithRawActsAndRawObs",
                        "Which agent to run, as a python path to an Agent class.")
    flags.DEFINE_enum("agent_race", "terran", sc2_env.Race._member_names_,  # pylint: disable=protected-access
                      "Agent 1's race.")

    flags.DEFINE_string("agent2", "Bot", "Second agent, either Bot or agent class.")
    flags.DEFINE_enum("agent2_race", "random", sc2_env.Race._member_names_,  # pylint: disable=protected-access
                      "Agent 2's race.")
    flags.DEFINE_enum("difficulty", "medium", sc2_env.Difficulty._member_names_,  # pylint: disable=protected-access
                      "If agent2 is a built-in Bot, it's strength.")

    flags.DEFINE_bool("profile", False, "Whether to turn on code profiling.")
    flags.DEFINE_bool("trace", False, "Whether to trace the code execution.")
    flags.DEFINE_integer("parallel", 1, "How many instances to run in parallel.")

    flags.DEFINE_bool("save_replay", True, "Whether to save a replay at the end.")

    flags.DEFINE_string("map", None, "Name of a map to use.")
    flags.mark_flag_as_required("map")

flags_defined = True

def run_thread(agent_classes, players, map_name, visualize):
  """Run one thread worth of the environment with agents."""
  with sc2_env.SC2Env(
      map_name=map_name,
      players=players,
      agent_interface_format=sc2_env.parse_agent_interface_format(
        feature_screen=FLAGS.feature_screen_size,
        feature_minimap=FLAGS.feature_minimap_size,
        rgb_screen=FLAGS.rgb_screen_size,
        rgb_minimap=FLAGS.rgb_minimap_size,
        action_space=FLAGS.action_space,
        use_raw_units=FLAGS.use_raw_units,
        raw_resolution=FLAGS.raw_resolution),
      step_mul=FLAGS.step_mul,
      game_steps_per_episode=FLAGS.game_steps_per_episode,
      disable_fog=FLAGS.disable_fog,
      visualize=visualize) as env:
    #env = available_actions_printer.AvailableActionsPrinter(env)
    agents = [agent_cls() for agent_cls in agent_classes]
    run_loop.run_loop(agents, env, FLAGS.max_agent_steps, FLAGS.max_episodes)
    if FLAGS.save_replay:
      env.save_replay(agent_classes[0].__name__)

def main(unused_argv):
  """Run an agent."""
  #stopwatch.sw.enabled = FLAGS.profile or FLAGS.trace
  #stopwatch.sw.trace = FLAGS.trace

  map_inst = maps.get(FLAGS.map)

  agent_classes = []
  players = []

  #agent_module, agent_name = FLAGS.agent.rsplit(".", 1)
  #agent_cls = getattr(importlib.import_module(agent_module), agent_name)
  #agent_classes.append(agent_cls)
  agent_classes.append(TerranRLAgentWithRawActsAndRawObs)
  players.append(sc2_env.Agent(sc2_env.Race[FLAGS.agent_race]))

  if map_inst.players >= 2:
    if FLAGS.agent2 == "Bot":
      players.append(sc2_env.Bot(sc2_env.Race[FLAGS.agent2_race],
                                 sc2_env.Difficulty[FLAGS.difficulty]))
    else:
      #agent_module, agent_name = FLAGS.agent2.rsplit(".", 1)
      #agent_cls = getattr(importlib.import_module(agent_module), agent_name)
      agent_classes.append(TerranRandomAgent)
      players.append(sc2_env.Agent(sc2_env.Race[FLAGS.agent2_race]))

  threads = []
  for _ in range(FLAGS.parallel - 1):
    t = threading.Thread(target=run_thread,
                         args=(agent_classes, players, FLAGS.map, False))
    threads.append(t)
    t.start()

  run_thread(agent_classes, players, FLAGS.map, FLAGS.render)

  for t in threads:
    t.join()

  if FLAGS.profile:
    pass
    #print(stopwatch.sw)

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


## 1. Creating a PySC2 Agent with Raw Actions & Observations

![StarCraft2 PySC2 interfaces](./images/StarCraft2_PySC2_interfaces.png)

ref : https://on-demand.gputechconf.com/gtc/2018/presentation/s8739-machine-learning-with-starcraft-II.pdf

### < PySC2 Interfaces 3가지 종류 >

### 1st, Rendered
* Decomposed :
    - Screen, minimap, resources, available actions
* Same control as humans :
    - Pixel coordinates
    - Move camera
    - Select unit/rectangle
* Great for Deep Learning, but hard

### 2nd, Feature Layer
* Same actions : still in pixel space
* Same decomposed observations, but more abstract
    - Orthogonal camera 
* Layers:
    - unit type
    - unit owner
    - selection
    - health
    - unit density
    - etc
    
### 3rd, Raw
* List of units and state
* Control each unit individually in world coordinates
* Gives all observable state (no camera)
* Great for scripted agents and programmatic replay analysis

### < Raw Actions & Observations 을 사용하는 이유>
* Raw Actions & Observations 은 world cordinates를 사용하므로 전체 Map을 한번에 관찰하고 Camera를 이동하지 않고도 Map 상의 어느 곳에서도 Action을 취할 수 있는 새로운 형태의 Feature 이다.
* 이번 과정에 SL(Supervised Learning, 지도학습)을 활용한 학습은 없지만 스타크래프트 2 리플레이를 활용한 SL은 Raw Actions & Observations를 활용한 "programmatic replay analysis"가 필요하다.
* 인간 플레이어를 이긴 DeepMind의 AlphaStar의 주요 변경사항 중의 하나는 Raw Actions & Observations 의 활용이다.

### DRL 모델의 성능 추이를 보기위해 Reward의 평균 추이를 이용한다. 이때 단순이동평균 보다는 지수이동평균이 적절하다.

### 지수이동평균(EMA:Exponential Moving Average) 란?
지수이동평균(Exponential Moving Average)은 과거의 모든 기간을 계산대상으로 하며 최근의 데이타에 더 높은 가중치를 두는 일종의 가중이동평균법이다.

단순이동평균의 계산법에 비하여 원리가 복잡해 보이지만 실제로 이동평균을 산출하는 방법은 Previous Step의 지수이동평균값과 평활계수(smoothing constant) 그리고 당일의 가격만으로 구할 수 있으므로 Previous Step의 지수이동평균값만 구해진다면 오히려 간단한 편이다.

따라서 지수이동평균은 단순이동평균에 비해 몇가지 중요한 강점을 가진다.

첫째는 가장 최근의 Step에 가장 큰 가중치를 둠으로 해서 최근의 Episode들을 잘 반영한다는 점이고, 둘째는 단순이동평균에서와 같이 오래된 데이타를 갑자기 제외하지 않고 천천히 그 영향력을 사라지게 한다는 점이다.
또한 전 기간의 데이타를 분석대상으로 함으로써 가중이동평균에서 문제되는 특정 기간의 데이타만을 분석대상으로 한다는 단점도 보완하고 있다.

### 지수이동평균(EMA:Exponential Moving Average) 계산

지수이동평균은 가장 최근의 값에 많은 가중치를 부여하고 오래 된 값에는 적은 가중치를 부여한다. 비록 오래 된 값이라고 할지라도 완전히 무시하지는 않고 적게나마 반영시켜 계산한다는 장점이 있다. 단기 변동성을 포착하려는 것이 목적이다.

EMA=Previous Step 지수이동평균+(k∗(Current Step Reward − Previous Step 지수이동평균))


## 3. Applying Vanilla DQN to a PySC2 Agent

구현된 기능

- Implementing 'Experience Replay' : 
    - 'Maximization Bias' 문제를 발생시키는 원인 중 하나인 'Sample간의 시간적 연관성'을 해결하기 위한 방법
    - Online Learning 에서 Batch Learning 으로 학습방법 바뀜 : Online update 는 Batch update 보다 일반적으로 Validation loss 가 더 높게 나타남.
    - Reinforcement Learning for Robots. Using Neural Networks. Long -Ji Lin. January 6, 1993. 논문에서 최초로 연구됨 http://isl.anthropomatik.kit.edu/pdf/Lin1993.pdf

- Implementing 'Fixed Q-Target' : 
    - 'Moving Q-Target' 문제 해결하기 위한 방법
    - 2015년 Nature 버전 DQN 논문에서 처음 제안됨. https://deepmind.com/research/publications/human-level-control-through-deep-reinforcement-learning 


구현되지 않은 기능

- Implementing 'Sensory Input Feature-Extraction' :
    - 게임의 Raw Image 를 Neural Net에 넣기 위한 Preprocessing(전처리) 과정
    - Raw Image 의 Sequence중 '최근 4개의 이미지'(과거 정보)를 하나의 새로운 State로 정의하여 non-MDP를 MDP 문제로 바꾸는 Preprocessing 과정 
    - CNN(합성곱 신경망)을 활용한 '차원의 저주' 극복

In [7]:
import random
import time
import math
import os.path

import numpy as np
import pandas as pd
from collections import deque
import pickle

from pysc2.agents import base_agent
from pysc2.env import sc2_env
from pysc2.lib import actions, features, units
from absl import app

import torch
from torch.utils.tensorboard import SummaryWriter

from skdrl.pytorch.model.mlp import NaiveMultiLayerPerceptron
from skdrl.common.memory.memory import ExperienceReplayMemory

In [8]:
DATA_FILE_QNET = 'rlagent_with_vanilla_dqn_qnet'
DATA_FILE_QNET_TARGET = 'rlagent_with_vanilla_dqn_qnet_target'
SCORE_FILE = 'rlagent_with_vanilla_dqn_score'

scores = []                        # list containing scores from each episode
scores_window = deque(maxlen=100)  # last 100 scores

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter()

### Q-update 공식

#### 1. Online Q-learning
![Online Q-learning](./images/q-update-experience-replay.png)

#### 2. Online Q-learning with Function Approximation
![Online Q-learning with Function Approximation](./images/q-update-function-approximation.png)

#### 3. Batch Q-learning with Function Approximation & Experience Replay
![Batch Q-learning with Function Approximation & Experience Replay](./images/q-update-online.png)

### Moving target problem

#### 1. Function Approximation을 사용하지 않는 Q-learning 의 경우 : 특정한 Q(s,a) update가 다른 Q(s,a)에 영향을 주지 않는다.
![Moving target Q-learning](./images/moving-target_q-learing_case.png)

#### 2. Function Approximation을 사용하는 Q-learnig 의 경우 : 특정한 Q(s,a) update가 다른 Q(s,a)에 영향을 준다.
![Moving target Q-learning with Function Approximation](./images/moving-target_q-learing_with_function_approximation_case.png)

### Moving target 문제는 Deep Neural Network를 사용하는 Function Approximation 기법인 경우 심해지는 경향성이 있음.

image ref : Fast Campus RL online courese

### `nn.SmoothL1Loss()` = Huber loss 란?

Mean-squared Error (MSE) Loss 는 데이터의 outlier에 매우 취약하다.
어떤 이유로 타겟하는 레이블 y (이 경우는 q-learning target)이 noisy 할때를 가정하면, 잘못된 y 값을 맞추기 위해 파라미터들이 너무 sensitive 하게 움직이게 된다.

이런 현상은 q-learning 의 학습초기에 매우 빈번해 나타난다. 이러한 문제를 조금이라도 완화하기 위해서 outlier에 덜 민감한 Huber loss 함수를 사용한다.

### SmoothL1Loss (aka Huber loss)

$$loss(x,y) = \frac{1}{n}\sum_i z_i$$
$|x_i - y_i| <1$ 일때,
$$z_i = 0.5(x_i - y_i)^2$$
$|x_i - y_i| \geq1$ 일때,
$$z_i = |x_i - y_i|-0.5$$

ref : https://pytorch.org/docs/master/generated/torch.nn.SmoothL1Loss.html

In [9]:
import torch
import torch.nn as nn
import numpy as np
import random

class DQN(nn.Module):

    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 qnet: nn.Module,
                 qnet_target: nn.Module,
                 lr: float,
                 gamma: float,
                 epsilon: float):
        """
        :param state_dim: input state dimension
        :param action_dim: action dimension
        :param qnet: main q network
        :param qnet_target: target q network
        :param lr: learning rate
        :param gamma: discount factor of MDP
        :param epsilon: E-greedy factor
        """

        super(DQN, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.qnet = qnet
        self.lr = lr
        self.gamma = gamma
        self.opt = torch.optim.Adam(params=self.qnet.parameters(), lr=lr)
        self.register_buffer('epsilon', torch.ones(1) * epsilon)

        # target network related
        qnet_target.load_state_dict(qnet.state_dict())
        self.qnet_target = qnet_target
        self.criteria = nn.SmoothL1Loss()

    def choose_action(self, state):
        qs = self.qnet(state)
        #prob = np.random.uniform(0.0, 1.0, 1)
        #if torch.from_numpy(prob).float() <= self.epsilon:  # random
        if random.random() <= self.epsilon: # random
            action = np.random.choice(range(self.action_dim))
        else:  # greedy
            action = qs.argmax(dim=-1)
        return int(action)

    def learn(self, state, action, reward, next_state, done):
        s, a, r, ns = state, action, reward, next_state

        # compute Q-Learning target with 'target network'
        with torch.no_grad():
            q_max, _ = self.qnet_target(ns).max(dim=-1, keepdims=True)
            q_target = r + self.gamma * q_max * (1 - done)

        q_val = self.qnet(s).gather(1, a)
        loss = self.criteria(q_val, q_target)

        self.opt.zero_grad()
        loss.backward()
        self.opt.step()


def prepare_training_inputs(sampled_exps, device='cpu'):
    states = []
    actions = []
    rewards = []
    next_states = []
    dones = []
    for sampled_exp in sampled_exps:
        states.append(sampled_exp[0])
        actions.append(sampled_exp[1])
        rewards.append(sampled_exp[2])
        next_states.append(sampled_exp[3])
        dones.append(sampled_exp[4])

    states = torch.cat(states, dim=0).float().to(device)
    actions = torch.cat(actions, dim=0).to(device)
    rewards = torch.cat(rewards, dim=0).float().to(device)
    next_states = torch.cat(next_states, dim=0).float().to(device)
    dones = torch.cat(dones, dim=0).float().to(device)
    return states, actions, rewards, next_states, dones

In [26]:
class TerranAgentWithRawActsAndRawObs(base_agent.BaseAgent):
    # actions 추가 및 함수 정의(hirerachy하게)
    
    actions = ("do_nothing",
               "train_scv",
               "harvest_minerals",
               "harvest_gas",
               "build_commandcenter",
               
               "build_refinery",
               "build_supply_depot",
               "build_barracks",
               "train_marine",
               
               "build_factorys",
               "build_techlab_factorys",
               "build_tank",
               
               "attack",
               "attack_all"
              )
    
    def unit_type_is_selected(self, obs, unit_type):
        if (len(obs.observation.single_select) > 0 and
            obs.observation.single_select[0].unit_type == unit_type):
              return True

        if (len(obs.observation.multi_select) > 0 and
            obs.observation.multi_select[0].unit_type == unit_type):
              return True

        return False

    def get_my_units_by_type(self, obs, unit_type):
        if unit_type == units.Neutral.VespeneGeyser: # 가스 일 때만
            return [unit for unit in obs.observation.raw_units
                if unit.unit_type == unit_type]
        
        return [unit for unit in obs.observation.raw_units
                if unit.unit_type == unit_type
                and unit.alliance == features.PlayerRelative.SELF]

    def get_enemy_units_by_type(self, obs, unit_type):
        return [unit for unit in obs.observation.raw_units
                if unit.unit_type == unit_type
                and unit.alliance == features.PlayerRelative.ENEMY]

    def get_my_completed_units_by_type(self, obs, unit_type):
        return [unit for unit in obs.observation.raw_units
                if unit.unit_type == unit_type
                and unit.build_progress == 100
                and unit.alliance == features.PlayerRelative.SELF]

    def get_enemy_completed_units_by_type(self, obs, unit_type):
        return [unit for unit in obs.observation.raw_units
                if unit.unit_type == unit_type
                and unit.build_progress == 100
                and unit.alliance == features.PlayerRelative.ENEMY]

    def get_distances(self, obs, units, xy):
        units_xy = [(unit.x, unit.y) for unit in units]
        return np.linalg.norm(np.array(units_xy) - np.array(xy), axis=1)

    def step(self, obs):
        super(TerranAgentWithRawActsAndRawObs, self).step(obs)
        if obs.first():
            command_center = self.get_my_units_by_type(
                obs, units.Terran.CommandCenter)[0]
            self.base_top_left = (command_center.x < 32)
            self.top_left_gas_xy = [(14, 25), (21,19), (46,23), (39,16)]
            self.bottom_right_gas_xy = [(44, 43), (37,50), (12,46), (19,53)]
            

    def do_nothing(self, obs):
        return actions.RAW_FUNCTIONS.no_op()
    
    def train_scv(self, obs):
        completed_commandcenterses = self.get_my_completed_units_by_type(
            obs, units.Terran.CommandCenter)
        
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        
        if (len(completed_commandcenterses) > 0 and obs.observation.player.minerals >= 100
                and len(scvs) < 35):
            commandcenters = self.get_my_units_by_type(obs, units.Terran.CommandCenter)[0]
            if commandcenters.order_length < 5:
                return actions.RAW_FUNCTIONS.Train_SCV_quick("now", commandcenters.tag)
        return actions.RAW_FUNCTIONS.no_op()

    def harvest_minerals(self, obs):
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        idle_scvs = [scv for scv in scvs if scv.order_length == 0]
        if len(idle_scvs) > 0:
            mineral_patches = [unit for unit in obs.observation.raw_units
                               if unit.unit_type in [
                                   units.Neutral.BattleStationMineralField,
                                   units.Neutral.BattleStationMineralField750,
                                   units.Neutral.LabMineralField,
                                   units.Neutral.LabMineralField750,
                                   units.Neutral.MineralField,
                                   units.Neutral.MineralField750,
                                   units.Neutral.PurifierMineralField,
                                   units.Neutral.PurifierMineralField750,
                                   units.Neutral.PurifierRichMineralField,
                                   units.Neutral.PurifierRichMineralField750,
                                   units.Neutral.RichMineralField,
                                   units.Neutral.RichMineralField750
                               ]]
            scv = random.choice(idle_scvs)
            distances = self.get_distances(obs, mineral_patches, (scv.x, scv.y))
            mineral_patch = mineral_patches[np.argmin(distances)]
            return actions.RAW_FUNCTIONS.Harvest_Gather_unit(
                "now", scv.tag, mineral_patch.tag)
        return actions.RAW_FUNCTIONS.no_op()
    
    def harvest_gas(self, obs):
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        refs = self.get_my_units_by_type(obs, units.Terran.Refinery)
        
        refs = [refinery for refinery in refs if refinery.assigned_harvesters < 3]
        
        if refs:

            ref = refs[0]
            
            if len(scvs) > 0 and ref.ideal_harvesters:

                scv = random.choice(scvs)
                distances = self.get_distances(obs, refs, (scv.x, scv.y))

                ref = refs[np.argmin(distances)]

                return actions.RAW_FUNCTIONS.Harvest_Gather_unit(
                    "now", scv.tag, ref.tag)
        
        return actions.RAW_FUNCTIONS.no_op()
    
    def build_commandcenter(self,obs):
        commandcenters = self.get_my_units_by_type(obs,units.Terran.CommandCenter)
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        
        if ( len(commandcenters) < 2 and obs.observation.player.minerals >= 400 and
                len(scvs) > 0):
            ccs_xy = (41, 21) if self.base_top_left else (17, 48)
            
            distances = self.get_distances(obs, scvs, ccs_xy)
            scv = scvs[np.argmin(distances)]

            return actions.RAW_FUNCTIONS.Build_CommandCenter_pt(
                "now", scv.tag, ccs_xy)
        return actions.RAW_FUNCTIONS.no_op()
    
    ################################################################################################
    ####################################### refinery ###############################################
    
    def build_refinery(self,obs):
        refinerys = self.get_my_units_by_type(obs,units.Terran.Refinery)
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        
        if (len(refinerys) < 2 and obs.observation.player.minerals >= 100 and
                len(scvs) > 0):
            gas = self.get_my_units_by_type(obs, units.Neutral.VespeneGeyser)[0]
            
            if self.base_top_left:
                gases = self.top_left_gas_xy
            else:
                gases = self.bottom_right_gas_xy
            
            rc = np.random.choice([0,1,2,3])
            gas_xy = gases[rc]
            print("gas pos:", gas.x,gas.y)
            if (gas.x, gas.y) == gas_xy:
                distances = self.get_distances(obs, scvs, gas_xy)
                scv = scvs[np.argmin(distances)]

                return actions.RAW_FUNCTIONS.Build_Refinery_pt(
                    "now", scv.tag, gas.tag)
        return actions.RAW_FUNCTIONS.no_op()

    def build_supply_depot(self, obs):
        supply_depots = self.get_my_units_by_type(obs, units.Terran.SupplyDepot)
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        
        free_supply = (obs.observation.player.food_cap -
                       obs.observation.player.food_used)
        
        if (obs.observation.player.minerals >= 100 and
            len(scvs) > 0 and free_supply < 8):
            
            ccs = self.get_my_units_by_type(obs, units.Terran.CommandCenter)
            if ccs:
                for cc in ccs:
                    cc_x, cc_y = cc.x, cc.y
                
                rand1,rand2 = random.randint(-5,10),random.randint(-5,10)
                supply_depot_xy = (cc_x + rand1, cc_y + rand2) if self.base_top_left else (cc_x - rand1, cc_y - rand2)
                if 0 < supply_depot_xy[0] < 64 and 0 < supply_depot_xy[1] < 64:
                    pass
                else:
                    return actions.RAW_FUNCTIONS.no_op()
                    
                
                distances = self.get_distances(obs, scvs, supply_depot_xy)
                scv = scvs[np.argmin(distances)]
                
                return actions.RAW_FUNCTIONS.Build_SupplyDepot_pt(
                    "now", scv.tag, supply_depot_xy)
                
        return actions.RAW_FUNCTIONS.no_op()

    def build_barracks(self, obs):
        completed_supply_depots = self.get_my_completed_units_by_type(
            obs, units.Terran.SupplyDepot)
        barrackses = self.get_my_units_by_type(obs, units.Terran.Barracks)
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        if (len(completed_supply_depots) > 0 and
            obs.observation.player.minerals >= 150 and len(scvs) > 0 and
            len(barrackses)< 3):
            
            brks = self.get_my_units_by_type(obs, units.Terran.SupplyDepot)
            
            if brks:
                for brk in brks:
                    brk_x,brk_y = brk.x, brk.y
                

                rand1, rand2 = random.randint(1,3),random.randint(1,3)
                barracks_xy = (brk_x + rand1, brk_y + rand2) if self.base_top_left else (brk_x - rand1, brk_y - rand2)
                if 0 < barracks_xy[0] < 64 and 0 < barracks_xy[1] < 64:
                    pass
                else:
                    return actions.RAW_FUNCTIONS.no_op()
                    

                distances = self.get_distances(obs, scvs, barracks_xy)
                scv = scvs[np.argmin(distances)]
                return actions.RAW_FUNCTIONS.Build_Barracks_pt(
                    "now", scv.tag, barracks_xy)
        return actions.RAW_FUNCTIONS.no_op()

    def train_marine(self, obs):
        completed_barrackses = self.get_my_completed_units_by_type(
            obs, units.Terran.Barracks)
        free_supply = (obs.observation.player.food_cap -
                       obs.observation.player.food_used)
        if (len(completed_barrackses) > 0 and obs.observation.player.minerals >= 100
                and free_supply > 0):
            barracks = self.get_my_units_by_type(obs, units.Terran.Barracks)[0]
            if barracks.order_length < 5:
                return actions.RAW_FUNCTIONS.Train_Marine_quick("now", barracks.tag)
        return actions.RAW_FUNCTIONS.no_op()
    
    ###############################################################################################
    ###################################### Factorys ###############################################
    ###############################################################################################
    
    def build_factorys(self, obs):
        completed_barrackses = self.get_my_completed_units_by_type(
            obs, units.Terran.Barracks)
        
        barrackses = self.get_my_units_by_type(obs, units.Terran.Barracks)
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        # print("gas: ", obs.observation.player.minerals)
        # print("gas: ", obs.observation.player.gas)
        if (len(completed_barrackses) > 0 and
            obs.observation.player.minerals >= 200 and 
            len(scvs) > 0):
            
            brks = self.get_my_units_by_type(obs, units.Terran.SupplyDepot)
            
            if brks:
                for brk in brks:
                    brk_x,brk_y = brk.x, brk.y
                

                rand1, rand2 = random.randint(1,3),random.randint(1,3)
                factorys_xy = (brk_x + rand1, brk_y + rand2) if self.base_top_left else (brk_x - rand1, brk_y - rand2)
                if 0 < factorys_xy[0] < 64 and 0 < factorys_xy[1] < 64:
                    pass
                else:
                    return actions.RAW_FUNCTIONS.no_op()
                    

                distances = self.get_distances(obs, scvs, factorys_xy)
                scv = scvs[np.argmin(distances)]
                return actions.RAW_FUNCTIONS.Build_Factory_pt(
                    "now", scv.tag, factorys_xy)
        return actions.RAW_FUNCTIONS.no_op()
    
    def build_techlab_factorys(self, obs):
        completed_factorys = self.get_my_completed_units_by_type(
            obs, units.Terran.Factory)
        
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        
        if (len(completed_factorys) > 0 and 
            obs.observation.player.minerals >= 200):
            
            ftrs = self.get_my_units_by_type(obs, units.Terran.Factory)
            
            if ftrs:
                for ftr in ftrs:
                    ftr_x,ftr_y = ftr.x, ftr.y
            
                factorys_xy = (ftr_x,ftr_y)
                if 0 < factorys_xy[0] < 64 and 0 < factorys_xy[1] < 64:
                    pass
                else:
                    return actions.RAW_FUNCTIONS.no_op()

                return actions.RAW_FUNCTIONS.Build_TechLab_Factory_pt(
                    "now", ftr.tag, factorys_xy)
        return actions.RAW_FUNCTIONS.no_op()
    
    def build_tank(self, obs):
        completed_factorytechlab = self.get_my_completed_units_by_type(
            obs, units.Terran.FactoryTechLab)
        
        free_supply = (obs.observation.player.food_cap -
                       obs.observation.player.food_used)
        
        if (len(completed_factorytechlab) > 0 and obs.observation.player.minerals >= 200
                and free_supply > 3):
            
            factorys = self.get_my_units_by_type(obs, units.Terran.Factory)[0]
            
            if factorys.order_length < 5:
                return actions.RAW_FUNCTIONS.Train_SiegeTank_quick("now", factorys.tag)
        return actions.RAW_FUNCTIONS.no_op()
    
    
    ############################################################################################
    
   
    def attack(self, obs):
        marines = self.get_my_units_by_type(obs, units.Terran.Marine)
        if 20 < len(marines):
            print("marine수:", len(marines))
            
            flag = random.randint(0,2)
            if flag == 1:
                attack_xy = (38, 44) if self.base_top_left else (19, 23)
            else:
                attack_xy = (16, 45) if self.base_top_left else (42, 19)
            
            
            distances = self.get_distances(obs, marines, attack_xy)
            marine = marines[np.argmax(distances)]
            #marine = marines
            
            x_offset = random.randint(-5, 5)
            y_offset = random.randint(-5, 5)
            return actions.RAW_FUNCTIONS.Attack_pt(
                "now", marine.tag, (attack_xy[0] + x_offset, attack_xy[1] + y_offset))
        return actions.RAW_FUNCTIONS.no_op()
    
    def attack_all(self,obs):
        # 추가 유닛 생길 때 마다 추가
        marines = self.get_my_units_by_type(obs, units.Terran.Marine)
        tank = self.get_my_units_by_type(obs, units.Terran.SiegeTank)
        
        all_units = marines + tank
        
        if 20 < len(all_units):
            
            flag = random.randint(0,2)
            if flag == 1:
                attack_xy = (44, 50) if self.base_top_left else (14, 19)
            else:
                attack_xy = (16, 45) if self.base_top_left else (42, 19)
            
            x_offset = random.randint(-5, 5)
            y_offset = random.randint(-5, 5)
            
            all_tag = [unit.tag for unit in all_units]
            
            return actions.RAW_FUNCTIONS.Attack_pt(
                "now", all_tag, (attack_xy[0] + x_offset, attack_xy[1] + y_offset))
        return actions.RAW_FUNCTIONS.no_op()
            
    

In [27]:
class TerranRandomAgent(TerranAgentWithRawActsAndRawObs):
    def step(self, obs):
        super(TerranRandomAgent, self).step(obs)
        action = random.choice(self.actions)
        
        return getattr(self, action)(obs)

### Hyperparameter

하이퍼파라미터는 심층강화학습 알고리즘에서 성능에 매우 큰 영향을 미칩니다.
이 실험에 쓰인 하이퍼파라미터는 https://github.com/chucnorrisful/dqn 실험에서 제안된 값들을 참고하였습니다.


- self.s_dim = 21
- self.a_dim = 6

- self.lr = 1e-4 * 1
- self.batch_size = 32
- self.gamma = 0.99
- self.memory_size = 200000
- self.eps_max = 1.0
- self.eps_min = 0.01
- self.epsilon = 1.0
- self.init_sampling = 4000
- self.target_update_interval = 10

- self.epsilon = max(self.eps_min, self.eps_max - self.eps_min * (self.episode_count / 50))


![Winning rate graph](./images/rlagent_with_vanilla_dqn_score-Terran-Terran-495_Eps.png)

In [28]:
class TerranRLAgentWithRawActsAndRawObs(TerranAgentWithRawActsAndRawObs):
    def __init__(self):
        super(TerranRLAgentWithRawActsAndRawObs, self).__init__()

        self.s_dim = 21
        self.a_dim = 14
        
        self.lr = 1e-4 * 1
        self.batch_size = 32
        self.gamma = 0.99
        self.memory_size = 200000
        self.eps_max = 1.0
        self.eps_min = 0.01
        self.epsilon = 1.0
        self.init_sampling = 4000
        self.target_update_interval = 10

        self.data_file_qnet = DATA_FILE_QNET
        self.data_file_qnet_target = DATA_FILE_QNET_TARGET
        self.score_file = SCORE_FILE
        
        self.qnetwork = NaiveMultiLayerPerceptron(input_dim=self.s_dim,
                           output_dim=self.a_dim,
                           num_neurons=[128],
                           hidden_act_func='ReLU',
                           out_act_func='Identity').to(device)
        
        self.qnetwork_target = NaiveMultiLayerPerceptron(input_dim=self.s_dim,
                           output_dim=self.a_dim,
                           num_neurons=[128],
                           hidden_act_func='ReLU',
                           out_act_func='Identity').to(device)
        
        ############################################ qnet 로드하면 이전 모델이라 학습모델 인풋 아웃풋차원이 바뀜 #########
        #if os.path.isfile(self.data_file_qnet + '.pt'):
        #    self.qnetwork.load_state_dict(torch.load(self.data_file_qnet + '.pt'))
            
        #if os.path.isfile(self.data_file_qnet_target + '.pt'):
        #    self.qnetwork_target.load_state_dict(torch.load(self.data_file_qnet_target + '.pt'))
        
        # initialize target network same as the main network.
        self.qnetwork_target.load_state_dict(self.qnetwork.state_dict())

        self.dqn = DQN(state_dim=self.s_dim,
                             action_dim=self.a_dim,
                             qnet=self.qnetwork,
                             qnet_target=self.qnetwork_target,
                             lr=self.lr,
                             gamma=self.gamma,
                             epsilon=self.epsilon).to(device)
        
        self.memory = ExperienceReplayMemory(self.memory_size)
        
        self.print_every = 1
        self.cum_reward = 0
        self.cum_loss = 0
        self.episode_count = 0
        
        self.new_game()


    def reset(self):
        super(TerranRLAgentWithRawActsAndRawObs, self).reset()
        self.new_game()

    def new_game(self):
        self.base_top_left = None
        self.previous_state = None
        self.previous_action = None
        self.cum_reward = 0
        self.cum_loss = 0
        
        # epsilon scheduling
        # slowly decaying_epsilon
        self.epsilon = max(self.eps_min, self.eps_max - self.eps_min * (self.episode_count / 50))
        self.dqn.epsilon = torch.tensor(self.epsilon).to(device)
        

    def get_state(self, obs):
        scvs = self.get_my_units_by_type(obs, units.Terran.SCV)
        idle_scvs = [scv for scv in scvs if scv.order_length == 0]
        command_centers = self.get_my_units_by_type(obs, units.Terran.CommandCenter)
        supply_depots = self.get_my_units_by_type(obs, units.Terran.SupplyDepot)
        completed_supply_depots = self.get_my_completed_units_by_type(
            obs, units.Terran.SupplyDepot)
        barrackses = self.get_my_units_by_type(obs, units.Terran.Barracks)
        completed_barrackses = self.get_my_completed_units_by_type(
            obs, units.Terran.Barracks)
        marines = self.get_my_units_by_type(obs, units.Terran.Marine)

        queued_marines = (completed_barrackses[0].order_length
        if len(completed_barrackses) > 0 else 0)

        free_supply = (obs.observation.player.food_cap -
                       obs.observation.player.food_used)
        can_afford_supply_depot = obs.observation.player.minerals >= 100
        can_afford_barracks = obs.observation.player.minerals >= 150
        can_afford_marine = obs.observation.player.minerals >= 100

        enemy_scvs = self.get_enemy_units_by_type(obs, units.Terran.SCV)
        enemy_idle_scvs = [scv for scv in enemy_scvs if scv.order_length == 0]
        enemy_command_centers = self.get_enemy_units_by_type(
            obs, units.Terran.CommandCenter)
        enemy_supply_depots = self.get_enemy_units_by_type(
            obs, units.Terran.SupplyDepot)
        enemy_completed_supply_depots = self.get_enemy_completed_units_by_type(
            obs, units.Terran.SupplyDepot)
        enemy_barrackses = self.get_enemy_units_by_type(obs, units.Terran.Barracks)
        enemy_completed_barrackses = self.get_enemy_completed_units_by_type(
            obs, units.Terran.Barracks)
        enemy_marines = self.get_enemy_units_by_type(obs, units.Terran.Marine)

        return (len(command_centers),
                len(scvs),
                len(idle_scvs),
                len(supply_depots),
                len(completed_supply_depots),
                len(barrackses),
                len(completed_barrackses),
                len(marines),
                queued_marines,
                free_supply,
                can_afford_supply_depot,
                can_afford_barracks,
                can_afford_marine,
                len(enemy_command_centers),
                len(enemy_scvs),
                len(enemy_idle_scvs),
                len(enemy_supply_depots),
                len(enemy_completed_supply_depots),
                len(enemy_barrackses),
                len(enemy_completed_barrackses),
                len(enemy_marines))

    def step(self, obs):
        super(TerranRLAgentWithRawActsAndRawObs, self).step(obs)
        
        #time.sleep(0.5)
        
        state = self.get_state(obs)
        state = torch.tensor(state).float().view(1, self.s_dim).to(device)
        action_idx = self.dqn.choose_action(state)
        action = self.actions[action_idx]
        done = True if obs.last() else False

        if self.previous_action is not None:
            experience = (self.previous_state.to(device),
                          torch.tensor(self.previous_action).view(1, 1).to(device),
                          torch.tensor(obs.reward).view(1, 1).to(device),
                          state.to(device),
                          torch.tensor(done).view(1, 1).to(device))
            self.memory.push(experience)
        
        self.cum_reward += obs.reward
        self.previous_state = state
        self.previous_action = action_idx
        
        if obs.last():
            self.episode_count = self.episode_count + 1
            
            if len(self.memory) >= self.init_sampling:
                # training dqn
                sampled_exps = self.memory.sample(self.batch_size)
                sampled_exps = prepare_training_inputs(sampled_exps, device)
                self.dqn.learn(*sampled_exps)

            if self.episode_count % self.target_update_interval == 0:
                self.dqn.qnet_target.load_state_dict(self.dqn.qnet.state_dict())

            if self.episode_count % self.print_every == 0:
                msg = (self.episode_count, self.cum_reward, self.epsilon)
                print("Episode : {:4.0f} | Cumulative Reward : {:4.0f} | Epsilon : {:.3f}".format(*msg))
            
            torch.save(self.dqn.qnet.state_dict(), self.data_file_qnet + '.pt')
            torch.save(self.dqn.qnet_target.state_dict(), self.data_file_qnet_target + '.pt')

            scores_window.append(obs.reward)  # save most recent reward
            win_rate = scores_window.count(1)/len(scores_window)*100
            tie_rate = scores_window.count(0)/len(scores_window)*100
            lost_rate = scores_window.count(-1)/len(scores_window)*100
            
            scores.append([win_rate, tie_rate, lost_rate])  # save most recent score(win_rate, tie_rate, lost_rate)
            with open(self.score_file + '.txt', "wb") as fp:
                pickle.dump(scores, fp)
            
            #writer.add_scalar("Loss/train", self.cum_loss/obs.observation.game_loop, self.episode_count)
            writer.add_scalar("Score", self.cum_reward, self.episode_count)

        return getattr(self, action)(obs)

In [None]:
if __name__ == "__main__":
  app.run(main)

I0920 04:51:13.630007 4721393088 sc_process.py:135] Launching SC2: /Applications/StarCraft II/Versions/Base81102/SC2.app/Contents/MacOS/SC2 -listen 127.0.0.1 -port 24007 -dataDir /Applications/StarCraft II/ -tempDir /var/folders/r1/x6k135_915z463fc7lc4hkp40000gn/T/sc-xvf80ema/ -displayMode 0 -windowwidth 640 -windowheight 480 -windowx 50 -windowy 50
I0920 04:51:13.639935 4721393088 remote_controller.py:167] Connecting to: ws://127.0.0.1:24007/sc2api, attempt: 0, running: True
I0920 04:51:14.644796 4721393088 remote_controller.py:167] Connecting to: ws://127.0.0.1:24007/sc2api, attempt: 1, running: True
I0920 04:51:15.646541 4721393088 remote_controller.py:167] Connecting to: ws://127.0.0.1:24007/sc2api, attempt: 2, running: True
I0920 04:51:16.650418 4721393088 remote_controller.py:167] Connecting to: ws://127.0.0.1:24007/sc2api, attempt: 3, running: True
I0920 04:51:17.657387 4721393088 remote_controller.py:167] Connecting to: ws://127.0.0.1:24007/sc2api, attempt: 4, running: True
I09

gas pos: 19 53
gas pos: 19 53
gas pos: 37 50
gas pos: 46 23
gas pos: 39 16
gas pos: 39 16
gas pos: 37 50
gas pos: 12 46
gas pos: 21 19
gas pos: 39 16
gas pos: 21 19
gas pos: 12 46
gas pos: 19 53
gas pos: 21 19
gas pos: 44 43
gas pos: 12 46
gas pos: 21 19
gas pos: 46 23
gas pos: 46 23
gas pos: 12 46
gas pos: 19 53
gas pos: 21 19
gas pos: 46 23
gas pos: 19 53
gas pos: 46 23
gas pos: 37 50
gas pos: 39 16
gas pos: 46 23
gas pos: 21 19
gas pos: 46 23
gas pos: 39 16
gas pos: 44 43
gas pos: 12 46
gas pos: 46 23
gas pos: 44 43
gas pos: 44 43
gas pos: 21 19
gas pos: 44 43
gas pos: 39 16
gas pos: 37 50
gas pos: 39 16
gas pos: 14 25
gas pos: 19 53
gas pos: 44 43
marine수: 21
marine수: 22
marine수: 21


I0920 04:53:03.489943 4721393088 sc2_env.py:752] Environment Close
I0920 04:53:03.491039 4721393088 sc2_env.py:752] Environment Close
I0920 04:53:03.491779 4721393088 sc2_env.py:752] Environment Close


marine수: 22
marine수: 22
marine수: 22
marine수: 24
marine수: 23
marine수: 23
marine수: 23
marine수: 25
marine수: 26
marine수: 26
marine수: 26
marine수: 26
marine수: 26
marine수: 28
marine수: 29
marine수: 29
marine수: 31
marine수: 32
marine수: 32
marine수: 32
marine수: 32
marine수: 34
marine수: 35
marine수: 35
marine수: 34
marine수: 34
marine수: 34
marine수: 33
marine수: 35
marine수: 36
marine수: 38
marine수: 39


I0920 04:54:32.425792 4721393088 sc2_env.py:725] Episode 1 finished after 23536 game steps. Outcome: [1], reward: [1], score: [10228]


marine수: 39
Episode :    1 | Cumulative Reward :    1 | Epsilon : 1.000


I0920 04:54:37.888087 4721393088 sc2_env.py:507] Starting episode 2: [terran, random] on Simple64


gas pos: 37 50
gas pos: 12 46
gas pos: 14 25
gas pos: 37 50
gas pos: 14 25
gas pos: 19 53
gas pos: 39 16
gas pos: 21 19
gas pos: 14 25
gas pos: 21 19
gas pos: 46 23
gas pos: 14 25
gas pos: 46 23
gas pos: 46 23
gas pos: 19 53
gas pos: 44 43
gas pos: 21 19
gas pos: 21 19
gas pos: 39 16
gas pos: 46 23
gas pos: 12 46
gas pos: 37 50
gas pos: 37 50
gas pos: 39 16
gas pos: 46 23
gas pos: 39 16
gas pos: 44 43
gas pos: 37 50
gas pos: 37 50
gas pos: 21 19
gas pos: 19 53
gas pos: 19 53
gas pos: 21 19
gas pos: 14 25
gas pos: 19 53
marine수: 22
marine수: 22
marine수: 22
marine수: 22
marine수: 22
marine수: 23
marine수: 24
marine수: 24
marine수: 24
marine수: 27
marine수: 28
marine수: 28
marine수: 28
marine수: 28
marine수: 29
marine수: 29
marine수: 30
marine수: 32
marine수: 32
marine수: 33
marine수: 33
marine수: 33
marine수: 33
marine수: 34
marine수: 34
marine수: 34
marine수: 36
marine수: 36
marine수: 37
marine수: 39
marine수: 40
marine수: 41
marine수: 41
marine수: 42
marine수: 42
marine수: 42
marine수: 44
marine수: 45
marine수: 47
marine수

I0920 04:56:21.970433 4721393088 sc2_env.py:725] Episode 2 finished after 14720 game steps. Outcome: [1], reward: [1], score: [13534]


Episode :    2 | Cumulative Reward :    1 | Epsilon : 1.000


I0920 04:56:27.263149 4721393088 sc2_env.py:507] Starting episode 3: [terran, random] on Simple64


gas pos: 21 19
gas pos: 37 50
gas pos: 21 19
gas pos: 21 19
gas pos: 14 25
gas pos: 14 25


### [Winning rate graph]

In [18]:
!pip install matplotlib

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 -m pip install --upgrade pip' command.[0m


In [19]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

SCORE_FILE = 'rlagent_with_vanilla_dqn_score'

ModuleNotFoundError: No module named 'matplotlib'

In [13]:
with open(SCORE_FILE + '.txt', "rb") as fp:
    scores = pickle.load(fp)

In [14]:
np_scores = np.array(scores)
np_scores

array([[  0.        ,   0.        , 100.        ],
       [ 50.        ,   0.        ,  50.        ],
       [ 33.33333333,   0.        ,  66.66666667],
       [ 25.        ,  25.        ,  50.        ],
       [ 40.        ,  20.        ,  40.        ],
       [ 50.        ,  16.66666667,  33.33333333],
       [ 42.85714286,  14.28571429,  42.85714286],
       [ 37.5       ,  12.5       ,  50.        ],
       [ 33.33333333,  11.11111111,  55.55555556],
       [ 30.        ,  10.        ,  60.        ],
       [ 27.27272727,  18.18181818,  54.54545455]])

In [15]:
# plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(np_scores)), np_scores.T[0], color='r', label='win rate')
plt.plot(np.arange(len(np_scores)), np_scores.T[1], color='g', label='tie rate')
plt.plot(np.arange(len(np_scores)), np_scores.T[2], color='b', label='lose rate')
plt.ylabel('Score %')
plt.xlabel('Episode #')
plt.legend(loc='best')
plt.show()

NameError: name 'plt' is not defined