# STEP 3 - Making RL PySC2 Agent with sparse reward

In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# PySC2 is built on Abseil (absl), which expects to be launched like a
# command-line application. That does not fit a Jupyter kernel, so sys.argv is
# overwritten here with the arguments the agent runner should see.


import sys
#sys.argv = ["python", "--map", "AbyssalReef"]
sys.argv = ["python", "--map", "Simple64"]

## 0. Running the agent code in a Jupyter notebook

In [7]:


# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run an agent."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import importlib
import threading

from absl import app
from absl import flags
from future.builtins import range  # pylint: disable=redefined-builtin

from pysc2 import maps
from pysc2.env import available_actions_printer
from pysc2.env import run_loop
from pysc2.env import sc2_env
from pysc2.lib import point_flag
from pysc2.lib import stopwatch

FLAGS = flags.FLAGS

# Notebook cells get re-run, but these flag definitions must only execute once
# per kernel (presumably because absl rejects a second definition of the same
# flag name - confirm). The module-level `flags_defined` sentinel assigned after
# this block makes every later execution of the cell skip the definitions.
if "flags_defined" not in globals():
    flags.DEFINE_bool("render", False, "Whether to render with pygame.")
    point_flag.DEFINE_point("feature_screen_size", "84",
                            "Resolution for screen feature layers.")
    point_flag.DEFINE_point("feature_minimap_size", "64",
                            "Resolution for minimap feature layers.")
    point_flag.DEFINE_point("rgb_screen_size", None,
                            "Resolution for rendered screen.")
    point_flag.DEFINE_point("rgb_minimap_size", None,
                            "Resolution for rendered minimap.")
    flags.DEFINE_enum("action_space", None, sc2_env.ActionSpace._member_names_,  # pylint: disable=protected-access
                      "Which action space to use. Needed if you take both feature "
                      "and rgb observations.")
    # The agent classes in this notebook read obs.observation.feature_units,
    # so this default is True.
    flags.DEFINE_bool("use_feature_units", True,
                      "Whether to include feature units.")
    flags.DEFINE_bool("disable_fog", True, "Whether to disable Fog of War.")

    flags.DEFINE_integer("max_agent_steps", 0, "Total agent steps.")
    flags.DEFINE_integer("game_steps_per_episode", None, "Game steps per episode.")
    flags.DEFINE_integer("max_episodes", 0, "Total episodes.")
    flags.DEFINE_integer("step_mul", 8, "Game steps per agent step.")
    flags.DEFINE_float("fps", 22.4, "Frames per second to run the game.")

    # Previous defaults, kept for reference:
    #flags.DEFINE_string("agent", "sc2.agent.BasicAgent.ZergBasicAgent",
    #                    "Which agent to run, as a python path to an Agent class.")
    #flags.DEFINE_enum("agent_race", "zerg", sc2_env.Race._member_names_,  # pylint: disable=protected-access
    #                  "Agent 1's race.")
    # NOTE: main() in this notebook appends TerranSparseRewardRLAgent directly
    # and does not read FLAGS.agent, so this value is informational only.
    flags.DEFINE_string("agent", "TerranSparseRewardRLAgent",
                        "Which agent to run, as a python path to an Agent class.")
    flags.DEFINE_enum("agent_race", "terran", sc2_env.Race._member_names_,  # pylint: disable=protected-access
                      "Agent 1's race.")

    flags.DEFINE_string("agent2", "Bot", "Second agent, either Bot or agent class.")
    flags.DEFINE_enum("agent2_race", "terran", sc2_env.Race._member_names_,  # pylint: disable=protected-access
                      "Agent 2's race.")
    flags.DEFINE_enum("difficulty", "very_easy", sc2_env.Difficulty._member_names_,  # pylint: disable=protected-access
                      "If agent2 is a built-in Bot, it's strength.")

    flags.DEFINE_bool("profile", False, "Whether to turn on code profiling.")
    flags.DEFINE_bool("trace", False, "Whether to trace the code execution.")
    flags.DEFINE_integer("parallel", 1, "How many instances to run in parallel.")

    flags.DEFINE_bool("save_replay", True, "Whether to save a replay at the end.")

    # "--map" is supplied through the sys.argv override earlier in the notebook.
    flags.DEFINE_string("map", None, "Name of a map to use.")
    flags.mark_flag_as_required("map")

# Sentinel checked by the guard above on every later execution of this cell.
flags_defined = True

def run_thread(agent_classes, players, map_name, visualize):
  """Play the configured episodes in one SC2 environment.

  Args:
    agent_classes: Agent classes; one instance of each is created per call.
    players: Player descriptors handed to `sc2_env.SC2Env`.
    map_name: Name of the map to load.
    visualize: Forwarded to `SC2Env` as its `visualize` argument.

  The observation/action interface, step multiplier, episode length and fog
  setting are all taken from the module-level FLAGS. When FLAGS.save_replay is
  set, a replay named after the first agent class is saved after the run loop.
  """
  interface_format = sc2_env.parse_agent_interface_format(
      feature_screen=FLAGS.feature_screen_size,
      feature_minimap=FLAGS.feature_minimap_size,
      rgb_screen=FLAGS.rgb_screen_size,
      rgb_minimap=FLAGS.rgb_minimap_size,
      action_space=FLAGS.action_space,
      use_feature_units=FLAGS.use_feature_units)

  with sc2_env.SC2Env(map_name=map_name,
                      players=players,
                      agent_interface_format=interface_format,
                      step_mul=FLAGS.step_mul,
                      game_steps_per_episode=FLAGS.game_steps_per_episode,
                      disable_fog=FLAGS.disable_fog,
                      visualize=visualize) as env:
    # Wrap the environment in pysc2's AvailableActionsPrinter.
    env = available_actions_printer.AvailableActionsPrinter(env)

    agents = []
    for agent_cls in agent_classes:
      agents.append(agent_cls())

    run_loop.run_loop(agents, env, FLAGS.max_agent_steps, FLAGS.max_episodes)

    if FLAGS.save_replay:
      replay_prefix = agent_classes[0].__name__
      env.save_replay(replay_prefix)

def main(unused_argv):
  """Run an agent.

  Player 1 is always the notebook-defined `TerranSparseRewardRLAgent`; the
  `--agent` flag is not consulted. On maps with two or more players, player 2
  is either the built-in bot (`--agent2 Bot`) or an agent class imported from
  the dotted path given in `--agent2`.

  `FLAGS.parallel - 1` extra environments run in background threads without
  visualisation; the last one runs in the calling thread and honours
  `--render`.
  """
  # Stopwatch profiling/tracing from the upstream runner is disabled here.

  map_inst = maps.get(FLAGS.map)

  agent_classes = [TerranSparseRewardRLAgent]
  players = [sc2_env.Agent(sc2_env.Race[FLAGS.agent_race])]

  if map_inst.players >= 2:
    if FLAGS.agent2 != "Bot":
      module_path, class_name = FLAGS.agent2.rsplit(".", 1)
      opponent_cls = getattr(importlib.import_module(module_path), class_name)
      agent_classes.append(opponent_cls)
      players.append(sc2_env.Agent(sc2_env.Race[FLAGS.agent2_race]))
    else:
      opponent = sc2_env.Bot(sc2_env.Race[FLAGS.agent2_race],
                             sc2_env.Difficulty[FLAGS.difficulty])
      players.append(opponent)

  workers = []
  for _ in range(FLAGS.parallel - 1):
    worker = threading.Thread(
        target=run_thread,
        args=(agent_classes, players, FLAGS.map, False))
    workers.append(worker)
    worker.start()

  # The final environment runs in this thread so it can be rendered.
  run_thread(agent_classes, players, FLAGS.map, FLAGS.render)

  for worker in workers:
    worker.join()

  if FLAGS.profile:
    # Profiling output is intentionally disabled in this notebook.
    pass

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


## 1. Creating a RL PySC2 Agent with Sparse Reward

In [8]:
import random
import time
import math
import os.path

import numpy as np
import pandas as pd


from pysc2.agents import base_agent
from pysc2.env import sc2_env
from pysc2.lib import actions, features, units
from absl import app

In [9]:
# File stem of the pickled Q-table; the agent appends '.gz' when loading it.
DATA_FILE = 'rlagent_with_sparse_reward_learning_data'

# Names of the high-level ("smart") actions the Q-table chooses between.
ACTION_DO_NOTHING = 'donothing'
ACTION_BUILD_SUPPLY_DEPOT = 'buildsupplydepot'
ACTION_BUILD_BARRACKS = 'buildbarracks'
ACTION_BUILD_MARINE = 'buildmarine'
ACTION_ATTACK = 'attack'

smart_actions = [
    ACTION_DO_NOTHING,
    ACTION_BUILD_SUPPLY_DEPOT,
    ACTION_BUILD_BARRACKS,
    ACTION_BUILD_MARINE,
]

# One attack action per 32x32 quadrant of the 64x64 grid, aimed at the
# quadrant centre: 'attack_15_15', 'attack_15_47', 'attack_47_15' and
# 'attack_47_47'. Only coordinates 31 and 63 satisfy (c + 1) % 32 == 0.
for mm_x in range(31, 64, 32):
    for mm_y in range(31, 64, 32):
        smart_actions.append('_'.join([ACTION_ATTACK, str(mm_x - 16), str(mm_y - 16)]))

### QLearningTable 클래스 learn() 함수 수정

#### Sparse Reward를 처리하기 위해 Terminal State 일때만 Reward가 발생하는 상황을 고려함.

#### < AS-IS >
    ...
        q_predict = self.q_table.ix[s, a]
        q_target = r + self.gamma * self.q_table.ix[s_, :].max()

        # update
        self.q_table.ix[s, a] += self.lr * (q_target - q_predict)
    ...
    
#### < TO-BE >
    ...
        q_predict = self.q_table.loc[s, a]

        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, self.q_table.columns[:]].max()
        else:
            q_target = r  # next state is terminal
            
        # update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)
    ...

In [None]:
# reference from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow
class QLearningTable:
    """Tabular Q-learning with an epsilon-greedy policy.

    The table is a pandas DataFrame with one row per visited state (indexed by
    the state's string form) and one column per action. Rows are created
    lazily, filled with zeros, the first time a state is seen.
    """

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        """
        Args:
            actions: List of action identifiers; they become the table columns.
            learning_rate: Step size applied to each temporal-difference update.
            reward_decay: Discount factor (gamma) for the next state's value.
            e_greedy: Probability of taking the greedy action; a uniformly
                random action is taken otherwise.
        """
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        """Return an action for `observation` using the epsilon-greedy rule."""
        self.check_state_exist(observation)

        if np.random.uniform() < self.epsilon:
            # Greedy choice. idxmax() returns the first maximum, so the row is
            # shuffled first to break ties between equal Q-values at random.
            state_action = self.q_table.loc[observation, :]
            state_action = state_action.reindex(np.random.permutation(state_action.index))
            action = state_action.idxmax()
        else:
            # Exploratory choice.
            action = np.random.choice(self.actions)

        return action

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        When `s_` is the string 'terminal' the target is just `r`; with sparse
        rewards this is the only transition that carries a non-zero reward.
        """
        self.check_state_exist(s_)
        self.check_state_exist(s)

        q_predict = self.q_table.loc[s, a]

        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r  # next state is terminal

        # update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)

    def check_state_exist(self, state):
        """Add an all-zero row for `state` if the table does not have one yet."""
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0, so the new row is
            # built as a one-row frame and concatenated instead.
            new_row = pd.DataFrame([[0.0] * len(self.actions)],
                                   index=[state],
                                   columns=self.q_table.columns,
                                   dtype=np.float64)
            if len(self.q_table.index) == 0:
                # Skip concat with a row-less frame, which newer pandas warns about.
                self.q_table = new_row
            else:
                self.q_table = pd.concat([self.q_table, new_row])

In [None]:
class TerranSparseRewardRLAgent(base_agent.BaseAgent):
    """Scaffold of the sparse-reward Terran agent.

    This version only works out which side of the map the base is on and where
    the command center sits; `step` always returns a no-op.
    """

    def __init__(self):
        super(TerranSparseRewardRLAgent, self).__init__()

        # One Q-table column per index into `smart_actions`.
        self.qlearn = QLearningTable(actions=list(range(len(smart_actions))))

        self.previous_action = None
        self.previous_state = None

        self.cc_y = None
        self.cc_x = None

        self.move_number = 0

        # Resume from a previously saved Q-table when one is present.
        saved_table = DATA_FILE + '.gz'
        if os.path.isfile(saved_table):
            self.qlearn.q_table = pd.read_pickle(saved_table, compression='gzip')

    def transformDistance(self, x, x_distance, y, y_distance):
        """Offset (x, y) by the given distances, negated when the base is not top-left."""
        if self.base_top_left:
            return [x + x_distance, y + y_distance]

        return [x - x_distance, y - y_distance]

    def transformLocation(self, x, y):
        """Return (x, y) unchanged for a top-left base, else reflected as 64 - coordinate."""
        if self.base_top_left:
            return [x, y]

        return [64 - x, 64 - y]

    def getMeanLocation(self, unitList):
        """Return [mean_x, mean_y] over the units' `x` and `y` attributes."""
        unit_total = len(unitList)
        total_x = sum(unit.x for unit in unitList)
        total_y = sum(unit.y for unit in unitList)

        return [total_x / unit_total, total_y / unit_total]

    def splitAction(self, action_id):
        """Split a smart action into (name, x, y); x and y are 0 when it has no coordinates."""
        action_name = smart_actions[action_id]

        if '_' not in action_name:
            return (action_name, 0, 0)

        # Coordinate parts stay strings, e.g. ('attack', '15', '47').
        action_name, x, y = action_name.split('_')
        return (action_name, x, y)

    def unit_type_is_selected(self, obs, unit_type):
        """Return True when the first single- or multi-selected unit has `unit_type`."""
        for selection_attr in ('single_select', 'multi_select'):
            selection = getattr(obs.observation, selection_attr)
            if len(selection) > 0 and selection[0].unit_type == unit_type:
                return True

        return False

    def get_units_by_type(self, obs, unit_type):
        """Return the feature units in the observation whose type is `unit_type`."""
        matching = []
        for unit in obs.observation.feature_units:
            if unit.unit_type == unit_type:
                matching.append(unit)
        return matching

    def can_do(self, obs, action):
        """Return whether `action` is listed in the observation's available actions."""
        available = obs.observation.available_actions
        return action in available

    def step(self, obs):
        """Record base orientation and command-center position, then do nothing."""
        super(TerranSparseRewardRLAgent, self).step(obs)

        if obs.first():
            own_mask = (obs.observation.feature_minimap.player_relative
                        == features.PlayerRelative.SELF)
            player_y, player_x = own_mask.nonzero()
            # The base counts as top-left when our minimap pixels average to
            # row 31 or above.
            if player_y.any() and player_y.mean() <= 31:
                self.base_top_left = 1
            else:
                self.base_top_left = 0
            print("player_y: ", player_y)
            print("player_y.mean(): ", player_y.mean())
            print("base_top_left: ", self.base_top_left)
            print("smart_actions: ", smart_actions)

        ccs = self.get_units_by_type(obs, units.Terran.CommandCenter)
        cc_count = len(ccs)
        if cc_count > 0:
            self.cc_x, self.cc_y = self.getMeanLocation(ccs)

        # Counted here but not used yet in this version.
        supply_depot_count = len(self.get_units_by_type(obs, units.Terran.SupplyDepot))
        barracks_count = len(self.get_units_by_type(obs, units.Terran.Barracks))

        return actions.FUNCTIONS.no_op()

### [run code]

In [None]:
# Hand control to absl, which calls main() defined earlier in this notebook.
# NOTE(review): absl's app.run() presumably ends with sys.exit(), so a
# SystemExit report after the game finishes is expected in a notebook - confirm.
if __name__ == "__main__":
  app.run(main)

## 2. Adding 1st Step of Hierarchy Actions

### < Hierarchy Actions >
#### Action Space를 줄여서 Q-table의 크기를 줄여 학습을 빠르게 하기 위함.

* Do nothing — do nothing for 3 steps
* Build supply depot — select SCV, build supply depot, send SCV to harvest minerals
* Build barracks — select SCV, build barracks, send SCV to harvest minerals
* Build marine — select all barracks, train marine, do nothing
* Attack (x, y) — select army, attack coordinates, do nothing

In [None]:
import random
import time
import math
import os.path

import numpy as np
import pandas as pd


from pysc2.agents import base_agent
from pysc2.env import sc2_env
from pysc2.lib import actions, features, units
from absl import app

In [None]:
# File stem of the pickled Q-table; the agent appends '.gz' when loading it.
DATA_FILE = 'rlagent_with_sparse_reward_learning_data'

# Names of the high-level ("smart") actions the Q-table chooses between.
ACTION_DO_NOTHING = 'donothing'
ACTION_BUILD_SUPPLY_DEPOT = 'buildsupplydepot'
ACTION_BUILD_BARRACKS = 'buildbarracks'
ACTION_BUILD_MARINE = 'buildmarine'
ACTION_ATTACK = 'attack'

smart_actions = [
    ACTION_DO_NOTHING,
    ACTION_BUILD_SUPPLY_DEPOT,
    ACTION_BUILD_BARRACKS,
    ACTION_BUILD_MARINE,
]

# One attack action per 32x32 quadrant of the 64x64 grid, aimed at the
# quadrant centre: 'attack_15_15', 'attack_15_47', 'attack_47_15' and
# 'attack_47_47'. Only coordinates 31 and 63 satisfy (c + 1) % 32 == 0.
for mm_x in range(31, 64, 32):
    for mm_y in range(31, 64, 32):
        smart_actions.append('_'.join([ACTION_ATTACK, str(mm_x - 16), str(mm_y - 16)]))

In [None]:
# reference from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow
class QLearningTable:
    """Tabular Q-learning with an epsilon-greedy policy.

    The table is a pandas DataFrame with one row per visited state (indexed by
    the state's string form) and one column per action. Rows are created
    lazily, filled with zeros, the first time a state is seen.
    """

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        """
        Args:
            actions: List of action identifiers; they become the table columns.
            learning_rate: Step size applied to each temporal-difference update.
            reward_decay: Discount factor (gamma) for the next state's value.
            e_greedy: Probability of taking the greedy action; a uniformly
                random action is taken otherwise.
        """
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        """Return an action for `observation` using the epsilon-greedy rule."""
        self.check_state_exist(observation)

        if np.random.uniform() < self.epsilon:
            # Greedy choice. idxmax() returns the first maximum, so the row is
            # shuffled first to break ties between equal Q-values at random.
            state_action = self.q_table.loc[observation, :]
            state_action = state_action.reindex(np.random.permutation(state_action.index))
            action = state_action.idxmax()
        else:
            # Exploratory choice.
            action = np.random.choice(self.actions)

        return action

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        When `s_` is the string 'terminal' the target is just `r`; with sparse
        rewards this is the only transition that carries a non-zero reward.
        """
        self.check_state_exist(s_)
        self.check_state_exist(s)

        q_predict = self.q_table.loc[s, a]

        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r  # next state is terminal

        # update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)

    def check_state_exist(self, state):
        """Add an all-zero row for `state` if the table does not have one yet."""
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0, so the new row is
            # built as a one-row frame and concatenated instead.
            new_row = pd.DataFrame([[0.0] * len(self.actions)],
                                   index=[state],
                                   columns=self.q_table.columns,
                                   dtype=np.float64)
            if len(self.q_table.index) == 0:
                # Skip concat with a row-less frame, which newer pandas warns about.
                self.q_table = new_row
            else:
                self.q_table = pd.concat([self.q_table, new_row])

In [None]:
class TerranSparseRewardRLAgent(base_agent.BaseAgent):
    """Terran agent that picks a high-level action with Q-learning.

    This version implements only the first step of each hierarchical action:
    selecting the unit(s) that will later carry the action out.
    """

    def __init__(self):
        """Create the Q-table (loading a saved one if present) and reset bookkeeping."""
        super(TerranSparseRewardRLAgent, self).__init__()
        
        # Q-table actions are integer indices into the module-level `smart_actions` list.
        self.qlearn = QLearningTable(actions=list(range(len(smart_actions))))
        
        # Previous decision, used as (s, a) of the next Q-learning update.
        self.previous_action = None
        self.previous_state = None
        
        # Mean position of the visible command centers, refreshed in step().
        self.cc_y = None
        self.cc_x = None
        
        # Index of the current sub-step inside a hierarchical action.
        self.move_number = 0
        
        # NOTE(review): unpickling can execute arbitrary code; only load
        # Q-table files that you created yourself.
        if os.path.isfile(DATA_FILE + '.gz'):
            self.qlearn.q_table = pd.read_pickle(DATA_FILE + '.gz', compression='gzip')

    def transformDistance(self, x, x_distance, y, y_distance):
        """Offset (x, y) by the given distances, negated when the base is not top-left."""
        if not self.base_top_left:
            return [x - x_distance, y - y_distance]
        
        return [x + x_distance, y + y_distance]
    
    def transformLocation(self, x, y):
        """Return (x, y) unchanged for a top-left base, else reflected as 64 - coordinate."""
        if not self.base_top_left:
            return [64 - x, 64 - y]
        
        return [x, y]
    
    def getMeanLocation(self, unitList):
        """Return [mean_x, mean_y] of the units' `x`/`y`; `unitList` must not be empty."""
        sum_x = 0
        sum_y = 0
        for unit in unitList:
            sum_x += unit.x
            sum_y += unit.y
        mean_x = sum_x / len(unitList)
        mean_y = sum_y / len(unitList)
        
        return [mean_x, mean_y]
    
    def splitAction(self, action_id):
        """Split a smart action into (name, x, y).

        For names containing '_' (e.g. 'attack_15_47') x and y are returned as
        strings; for all other actions they are the integer 0.
        """
        smart_action = smart_actions[action_id]
            
        x = 0
        y = 0
        if '_' in smart_action:
            smart_action, x, y = smart_action.split('_')

        return (smart_action, x, y)
    
    def unit_type_is_selected(self, obs, unit_type):
        """Return True when the first single- or multi-selected unit has `unit_type`."""
        if (len(obs.observation.single_select) > 0 and
            obs.observation.single_select[0].unit_type == unit_type):
              return True

        if (len(obs.observation.multi_select) > 0 and
            obs.observation.multi_select[0].unit_type == unit_type):
              return True

        return False

    def get_units_by_type(self, obs, unit_type):
        """Return the feature units in the observation whose type is `unit_type`."""
        return [unit for unit in obs.observation.feature_units
                if unit.unit_type == unit_type]

    def can_do(self, obs, action):
        """Return whether `action` is listed in the observation's available actions."""
        return action in obs.observation.available_actions
        
    def step(self, obs):
        """Choose a smart action and return its selection sub-step.

        While `move_number` is 0 this builds the state vector, applies a
        Q-learning update with reward 0 for the previous decision, picks a new
        smart action and returns the matching select action. In every other
        case it returns a no-op. Nothing in this class resets `move_number`,
        so the decision branch runs once per agent instance.
        """
        super(TerranSparseRewardRLAgent, self).step(obs)

        #time.sleep(0.5)
        
        if obs.first():
            # The base counts as top-left when our own minimap pixels average
            # to row 31 or above.
            player_y, player_x = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.SELF).nonzero()
            self.base_top_left = 1 if player_y.any() and player_y.mean() <= 31 else 0
        
        ccs = self.get_units_by_type(obs, units.Terran.CommandCenter)
        if len(ccs) > 0:
            self.cc_x, self.cc_y = self.getMeanLocation(ccs)
            
        cc_count = len(ccs)
        
        supply_depot_count = len(self.get_units_by_type(obs, units.Terran.SupplyDepot))

        barracks_count = len(self.get_units_by_type(obs, units.Terran.Barracks))

        # NOTE(review): food_used is presumably total supply (workers included)
        # rather than army supply; pysc2 also exposes food_army - confirm which
        # one is intended here.
        army_supply = obs.observation.player.food_used
        
        if self.move_number == 0:
            self.move_number += 1
            
            # State layout: [cc count, depot count, barracks count, supply,
            #                4 quadrant flags for enemy presence on the minimap].
            current_state = np.zeros(8)
            current_state[0] = cc_count
            current_state[1] = supply_depot_count
            current_state[2] = barracks_count
            current_state[3] = army_supply
    
            hot_squares = np.zeros(4)        
            enemy_y, enemy_x = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.ENEMY).nonzero()
            for i in range(0, len(enemy_y)):
                # Map each enemy pixel to a 1-based 32x32 quadrant row/column
                # (assumes a 64x64 minimap - TODO confirm against the flags).
                y = int(math.ceil((enemy_y[i] + 1) / 32))
                x = int(math.ceil((enemy_x[i] + 1) / 32))
                
                hot_squares[((y - 1) * 2) + (x - 1)] = 1
            
            # Reverse the quadrant order for a bottom-right base so the state
            # is expressed relative to our own base.
            if not self.base_top_left:
                hot_squares = hot_squares[::-1]
            
            for i in range(0, 4):
                current_state[i + 4] = hot_squares[i]
    
            # Sparse reward: intermediate transitions are learned with reward 0.
            if self.previous_action is not None:
                self.qlearn.learn(str(self.previous_state), self.previous_action, 0, str(current_state))
        
            rl_action = self.qlearn.choose_action(str(current_state))

            self.previous_state = current_state
            self.previous_action = rl_action
        
            smart_action, x, y = self.splitAction(self.previous_action)
            
            if smart_action == ACTION_BUILD_BARRACKS or smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                # Building needs a worker: select one random on-screen SCV.
                if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                    scvs = self.get_units_by_type(obs, units.Terran.SCV)
                    if len(scvs) > 0:
                        scv = random.choice(scvs)
                        if scv.x >= 0 and scv.y >= 0:
                            return actions.FUNCTIONS.select_point("select", (scv.x,
                                                                              scv.y))
                
            elif smart_action == ACTION_BUILD_MARINE:
                # "select_all_type" on one barracks is used to select barracks
                # as a group.
                if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                    barracks = self.get_units_by_type(obs, units.Terran.Barracks)
                    if len(barracks) > 0:
                        barrack = random.choice(barracks)
                        if barrack.x >= 0 and barrack.y >= 0:
                            return actions.FUNCTIONS.select_point("select_all_type", (barrack.x,
                                                                                  barrack.y))
                
            elif smart_action == ACTION_ATTACK:
                if self.can_do(obs, actions.FUNCTIONS.select_army.id):
                    return actions.FUNCTIONS.select_army("select")
        
        # Fallback: 'donothing', a failed precondition, or move_number != 0.
        return actions.FUNCTIONS.no_op()

### [run code]

In [None]:
# Hand control to absl, which calls main() defined earlier in this notebook.
# NOTE(review): absl's app.run() presumably ends with sys.exit(), so a
# SystemExit report after the game finishes is expected in a notebook - confirm.
if __name__ == "__main__":
  app.run(main)

## 3. Adding 2nd Step of Hierarchy Actions

In [None]:
import random
import time
import math
import os.path

import numpy as np
import pandas as pd


from pysc2.agents import base_agent
from pysc2.env import sc2_env
from pysc2.lib import actions, features, units
from absl import app

In [None]:
# File stem of the pickled Q-table; the agent appends '.gz' when loading it.
DATA_FILE = 'rlagent_with_sparse_reward_learning_data'

# Names of the high-level ("smart") actions the Q-table chooses between.
ACTION_DO_NOTHING = 'donothing'
ACTION_BUILD_SUPPLY_DEPOT = 'buildsupplydepot'
ACTION_BUILD_BARRACKS = 'buildbarracks'
ACTION_BUILD_MARINE = 'buildmarine'
ACTION_ATTACK = 'attack'

smart_actions = [
    ACTION_DO_NOTHING,
    ACTION_BUILD_SUPPLY_DEPOT,
    ACTION_BUILD_BARRACKS,
    ACTION_BUILD_MARINE,
]

# One attack action per 32x32 quadrant of the 64x64 grid, aimed at the
# quadrant centre: 'attack_15_15', 'attack_15_47', 'attack_47_15' and
# 'attack_47_47'. Only coordinates 31 and 63 satisfy (c + 1) % 32 == 0.
for mm_x in range(31, 64, 32):
    for mm_y in range(31, 64, 32):
        smart_actions.append('_'.join([ACTION_ATTACK, str(mm_x - 16), str(mm_y - 16)]))

In [None]:
# reference from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow
class QLearningTable:
    """Tabular Q-learning with an epsilon-greedy policy.

    The table is a pandas DataFrame with one row per visited state (indexed by
    the state's string form) and one column per action. Rows are created
    lazily, filled with zeros, the first time a state is seen.
    """

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        """
        Args:
            actions: List of action identifiers; they become the table columns.
            learning_rate: Step size applied to each temporal-difference update.
            reward_decay: Discount factor (gamma) for the next state's value.
            e_greedy: Probability of taking the greedy action; a uniformly
                random action is taken otherwise.
        """
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        """Return an action for `observation` using the epsilon-greedy rule."""
        self.check_state_exist(observation)

        if np.random.uniform() < self.epsilon:
            # Greedy choice. idxmax() returns the first maximum, so the row is
            # shuffled first to break ties between equal Q-values at random.
            state_action = self.q_table.loc[observation, :]
            state_action = state_action.reindex(np.random.permutation(state_action.index))
            action = state_action.idxmax()
        else:
            # Exploratory choice.
            action = np.random.choice(self.actions)

        return action

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        When `s_` is the string 'terminal' the target is just `r`; with sparse
        rewards this is the only transition that carries a non-zero reward.
        """
        self.check_state_exist(s_)
        self.check_state_exist(s)

        q_predict = self.q_table.loc[s, a]

        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r  # next state is terminal

        # update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)

    def check_state_exist(self, state):
        """Add an all-zero row for `state` if the table does not have one yet."""
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0, so the new row is
            # built as a one-row frame and concatenated instead.
            new_row = pd.DataFrame([[0.0] * len(self.actions)],
                                   index=[state],
                                   columns=self.q_table.columns,
                                   dtype=np.float64)
            if len(self.q_table.index) == 0:
                # Skip concat with a row-less frame, which newer pandas warns about.
                self.q_table = new_row
            else:
                self.q_table = pd.concat([self.q_table, new_row])

In [None]:
class TerranSparseRewardRLAgent(base_agent.BaseAgent):
    """Terran agent that picks a high-level action with Q-learning.

    This version implements the first two steps of each hierarchical action:
    selecting the unit(s), then issuing the build / train / attack command.
    """

    def __init__(self):
        """Create the Q-table (loading a saved one if present) and reset bookkeeping."""
        super(TerranSparseRewardRLAgent, self).__init__()
        
        # Q-table actions are integer indices into the module-level `smart_actions` list.
        self.qlearn = QLearningTable(actions=list(range(len(smart_actions))))
        
        # Previous decision, used as (s, a) of the next Q-learning update.
        self.previous_action = None
        self.previous_state = None
        
        # Mean position of the visible command centers, refreshed in step().
        self.cc_y = None
        self.cc_x = None
        
        # Index of the current sub-step inside a hierarchical action.
        self.move_number = 0
        
        # NOTE(review): unpickling can execute arbitrary code; only load
        # Q-table files that you created yourself.
        if os.path.isfile(DATA_FILE + '.gz'):
            self.qlearn.q_table = pd.read_pickle(DATA_FILE + '.gz', compression='gzip')

    def transformDistance(self, x, x_distance, y, y_distance):
        """Offset (x, y) by the given distances, negated when the base is not top-left."""
        if not self.base_top_left:
            return [x - x_distance, y - y_distance]
        
        return [x + x_distance, y + y_distance]
    
    def transformLocation(self, x, y):
        """Return (x, y) unchanged for a top-left base, else reflected as 64 - coordinate."""
        if not self.base_top_left:
            return [64 - x, 64 - y]
        
        return [x, y]
    
    def getMeanLocation(self, unitList):
        """Return [mean_x, mean_y] of the units' `x`/`y`; `unitList` must not be empty."""
        sum_x = 0
        sum_y = 0
        for unit in unitList:
            sum_x += unit.x
            sum_y += unit.y
        mean_x = sum_x / len(unitList)
        mean_y = sum_y / len(unitList)
        
        return [mean_x, mean_y]
    
    def splitAction(self, action_id):
        """Split a smart action into (name, x, y).

        For names containing '_' (e.g. 'attack_15_47') x and y are returned as
        strings; for all other actions they are the integer 0.
        """
        smart_action = smart_actions[action_id]
            
        x = 0
        y = 0
        if '_' in smart_action:
            smart_action, x, y = smart_action.split('_')

        return (smart_action, x, y)
    
    def unit_type_is_selected(self, obs, unit_type):
        """Return True when the first single- or multi-selected unit has `unit_type`."""
        if (len(obs.observation.single_select) > 0 and
            obs.observation.single_select[0].unit_type == unit_type):
              return True

        if (len(obs.observation.multi_select) > 0 and
            obs.observation.multi_select[0].unit_type == unit_type):
              return True

        return False

    def get_units_by_type(self, obs, unit_type):
        """Return the feature units in the observation whose type is `unit_type`."""
        return [unit for unit in obs.observation.feature_units
                if unit.unit_type == unit_type]

    def can_do(self, obs, action):
        """Return whether `action` is listed in the observation's available actions."""
        return action in obs.observation.available_actions
        
    def step(self, obs):
        """Advance the current hierarchical action by one sub-step.

        move_number 0: build the state vector, apply a Q-learning update with
        reward 0 for the previous decision, choose a new smart action and
        return its selection action.
        move_number 1: return the command (build, train or attack) for the
        smart action chosen on the previous call.
        In every other case a no-op is returned. Nothing in this class resets
        `move_number`, so each branch runs once per agent instance.
        """
        super(TerranSparseRewardRLAgent, self).step(obs)

        #time.sleep(0.5)
        
        if obs.first():
            # The base counts as top-left when our own minimap pixels average
            # to row 31 or above.
            player_y, player_x = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.SELF).nonzero()
            self.base_top_left = 1 if player_y.any() and player_y.mean() <= 31 else 0
        
        ccs = self.get_units_by_type(obs, units.Terran.CommandCenter)
        if len(ccs) > 0:
            self.cc_x, self.cc_y = self.getMeanLocation(ccs)
            
        cc_count = len(ccs)
        
        supply_depot_count = len(self.get_units_by_type(obs, units.Terran.SupplyDepot))

        barracks_count = len(self.get_units_by_type(obs, units.Terran.Barracks))

        # NOTE(review): food_used is presumably total supply (workers included)
        # rather than army supply; pysc2 also exposes food_army - confirm which
        # one is intended here.
        army_supply = obs.observation.player.food_used
        
        if self.move_number == 0:
            self.move_number += 1
            
            # State layout: [cc count, depot count, barracks count, supply,
            #                4 quadrant flags for enemy presence on the minimap].
            current_state = np.zeros(8)
            current_state[0] = cc_count
            current_state[1] = supply_depot_count
            current_state[2] = barracks_count
            current_state[3] = army_supply
    
            hot_squares = np.zeros(4)        
            enemy_y, enemy_x = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.ENEMY).nonzero()
            for i in range(0, len(enemy_y)):
                # Map each enemy pixel to a 1-based 32x32 quadrant row/column
                # (assumes a 64x64 minimap - TODO confirm against the flags).
                y = int(math.ceil((enemy_y[i] + 1) / 32))
                x = int(math.ceil((enemy_x[i] + 1) / 32))
                
                hot_squares[((y - 1) * 2) + (x - 1)] = 1
            
            # Reverse the quadrant order for a bottom-right base so the state
            # is expressed relative to our own base.
            if not self.base_top_left:
                hot_squares = hot_squares[::-1]
            
            for i in range(0, 4):
                current_state[i + 4] = hot_squares[i]
    
            # Sparse reward: intermediate transitions are learned with reward 0.
            if self.previous_action is not None:
                self.qlearn.learn(str(self.previous_state), self.previous_action, 0, str(current_state))
        
            rl_action = self.qlearn.choose_action(str(current_state))

            self.previous_state = current_state
            self.previous_action = rl_action
        
            smart_action, x, y = self.splitAction(self.previous_action)
            
            if smart_action == ACTION_BUILD_BARRACKS or smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                # Building needs a worker: select one random on-screen SCV.
                if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                    scvs = self.get_units_by_type(obs, units.Terran.SCV)
                    if len(scvs) > 0:
                        scv = random.choice(scvs)
                        if scv.x >= 0 and scv.y >= 0:
                            return actions.FUNCTIONS.select_point("select", (scv.x,
                                                                              scv.y))
                
            elif smart_action == ACTION_BUILD_MARINE:
                # "select_all_type" on one barracks is used to select barracks
                # as a group.
                if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                    barracks = self.get_units_by_type(obs, units.Terran.Barracks)
                    if len(barracks) > 0:
                        barrack = random.choice(barracks)
                        if barrack.x >= 0 and barrack.y >= 0:
                            return actions.FUNCTIONS.select_point("select_all_type", (barrack.x,
                                                                                  barrack.y))
                
            elif smart_action == ACTION_ATTACK:
                if self.can_do(obs, actions.FUNCTIONS.select_army.id):
                    return actions.FUNCTIONS.select_army("select")
                
        elif self.move_number == 1:
            self.move_number += 1
            
            # Re-derive the action chosen on the previous call.
            smart_action, x, y = self.splitAction(self.previous_action)
                
            if smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                # At most two depots, each at a fixed offset from the command center.
                if supply_depot_count < 2 and self.can_do(obs, actions.FUNCTIONS.Build_SupplyDepot_screen.id):
                    if len(ccs) > 0:
                        if supply_depot_count == 0:
                            target = self.transformDistance(self.cc_x, -35, self.cc_y, 0)
                        elif supply_depot_count == 1:
                            target = self.transformDistance(self.cc_x, -25, self.cc_y, -25)
    
                        # NOTE(review): target is not clamped to the screen;
                        # presumably pysc2 rejects out-of-range coordinates - confirm.
                        return actions.FUNCTIONS.Build_SupplyDepot_screen("now", target) 
            
            elif smart_action == ACTION_BUILD_BARRACKS:
                # At most two barracks, each at a fixed offset from the command center.
                if barracks_count < 2 and self.can_do(obs, actions.FUNCTIONS.Build_Barracks_screen.id):
                    if len(ccs) > 0:
                        if  barracks_count == 0:
                            target = self.transformDistance(self.cc_x, 15, self.cc_y, -9)
                        elif  barracks_count == 1:
                            target = self.transformDistance(self.cc_x, 15, self.cc_y, 12)
    
                        return actions.FUNCTIONS.Build_Barracks_screen("now", target)
    
            elif smart_action == ACTION_BUILD_MARINE:
                if self.can_do(obs, actions.FUNCTIONS.Train_Marine_quick.id):
                    return actions.FUNCTIONS.Train_Marine_quick("queued")
        
            elif smart_action == ACTION_ATTACK:
                if self.can_do(obs, actions.FUNCTIONS.Attack_minimap.id):
                    # Jitter the target by -8, 0 or +8 on each axis.
                    x_offset = random.randint(-1, 1)
                    y_offset = random.randint(-1, 1)
                    
                    # x and y are strings from splitAction, hence the int() conversions.
                    return actions.FUNCTIONS.Attack_minimap("now", self.transformLocation(int(x) + (x_offset * 8), int(y) + (y_offset * 8)))
            
        # Fallback: 'donothing', a failed precondition, or move_number >= 2.
        return actions.FUNCTIONS.no_op()

### [run code]

In [None]:
# Hand control to absl, which calls main() defined earlier in this notebook.
# NOTE(review): absl's app.run() presumably ends with sys.exit(), so a
# SystemExit report after the game finishes is expected in a notebook - confirm.
if __name__ == "__main__":
  app.run(main)

## 4. Adding 3rd Step of Hierarchy Actions

In [None]:
import random
import time
import math
import os.path

import numpy as np
import pandas as pd


from pysc2.agents import base_agent
from pysc2.env import sc2_env
from pysc2.lib import actions, features, units
from absl import app

In [None]:
# Base name (without the '.gz' suffix) of the pickled Q-table.
DATA_FILE = 'rlagent_with_sparse_reward_learning_data'

# Names of the high-level ("smart") actions the agent can choose from.
ACTION_DO_NOTHING = 'donothing'
ACTION_BUILD_SUPPLY_DEPOT = 'buildsupplydepot'
ACTION_BUILD_BARRACKS = 'buildbarracks'
ACTION_BUILD_MARINE = 'buildmarine'
ACTION_ATTACK = 'attack'

smart_actions = [
    ACTION_DO_NOTHING,
    ACTION_BUILD_SUPPLY_DEPOT,
    ACTION_BUILD_BARRACKS,
    ACTION_BUILD_MARINE,
]

# One attack action per 32x32 quadrant of a 64x64 grid, encoded as
# 'attack_<x>_<y>'. mm_x / mm_y visit the last cell of each quadrant (31, 63);
# subtracting 16 gives the quadrant centres (15, 47).
for mm_x in range(31, 64, 32):
    for mm_y in range(31, 64, 32):
        smart_actions.append('%s_%d_%d' % (ACTION_ATTACK, mm_x - 16, mm_y - 16))

In [None]:
# reference from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow
class QLearningTable:
    """Tabular Q-learning with an epsilon-greedy policy.

    The Q-table is a ``pandas.DataFrame`` with one row per visited state
    (indexed by the state's label) and one column per action.

    Args:
        actions: list of action identifiers; used as the table's columns.
        learning_rate: step size applied to each temporal-difference update.
        reward_decay: discount factor (gamma) applied to the next state's best value.
        e_greedy: probability of taking the greedy action; otherwise an action
            is drawn uniformly at random.
    """

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        """Return an action for ``observation`` using the epsilon-greedy rule.

        The state is added to the table first if it has not been seen before.
        """
        self.check_state_exist(observation)

        if np.random.uniform() < self.epsilon:
            # choose best action
            state_action = self.q_table.loc[observation, :]

            # some actions have the same value: shuffle so ties are broken at random
            state_action = state_action.reindex(np.random.permutation(state_action.index))

            action = state_action.idxmax()
        else:
            # choose random action
            action = np.random.choice(self.actions)

        return action

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        Args:
            s: label of the state the action was taken in.
            a: the action taken (a column of the Q-table).
            r: reward received.
            s_: label of the resulting state; the literal ``'terminal'`` means
                the episode ended, so no future value is bootstrapped.
        """
        self.check_state_exist(s_)
        self.check_state_exist(s)

        q_predict = self.q_table.loc[s, a]

        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r  # next state is terminal

        # update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)

    def check_state_exist(self, state):
        """Add an all-zero row for ``state`` if the table does not have one yet."""
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; assigning to a new
            # label through .loc enlarges the table instead. Float zeros keep
            # the columns as float64.
            self.q_table.loc[state] = [0.0] * len(self.actions)

In [None]:
class TerranSparseRewardRLAgent(base_agent.BaseAgent):
    """Terran agent that picks high-level "smart actions" with tabular Q-learning.

    Each smart action is spread over three consecutive game steps, tracked by
    ``self.move_number``:

    * 0 -- build the state, update the Q-table, choose an action and select
      the unit(s) that will carry it out;
    * 1 -- issue the build / train / attack command;
    * 2 -- for build actions, queue the SCV back onto a mineral field.

    This version always learns with a reward of 0 and does not handle the end
    of an episode.
    """

    def __init__(self):
        super(TerranSparseRewardRLAgent, self).__init__()

        # One Q-table column per index into ``smart_actions``.
        self.qlearn = QLearningTable(actions=list(range(len(smart_actions))))

        self.previous_action = None
        self.previous_state = None

        # Mean screen position of the command centre(s), refreshed in step().
        self.cc_y = None
        self.cc_x = None

        # Set from the first observation of an episode in step().
        self.base_top_left = None

        # Position (0, 1 or 2) inside the current three-step smart action.
        self.move_number = 0

        # Resume from a stored Q-table when one exists.
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        if os.path.isfile(DATA_FILE + '.gz'):
            self.qlearn.q_table = pd.read_pickle(DATA_FILE + '.gz', compression='gzip')

    def transformDistance(self, x, x_distance, y, y_distance):
        """Offset (x, y) by the given distances, negating them when the base is not top-left."""
        if not self.base_top_left:
            return [x - x_distance, y - y_distance]

        return [x + x_distance, y + y_distance]

    def transformLocation(self, x, y):
        """Return [x, y], reflected as [64 - x, 64 - y] when the base is not top-left.

        NOTE(review): the constant 64 presumably matches the minimap size -- confirm.
        """
        if not self.base_top_left:
            return [64 - x, 64 - y]

        return [x, y]

    def getMeanLocation(self, unitList):
        """Return [mean_x, mean_y] of the units' ``x`` / ``y`` attributes.

        Raises ZeroDivisionError for an empty list; callers check for that.
        """
        sum_x = 0
        sum_y = 0
        for unit in unitList:
            sum_x += unit.x
            sum_y += unit.y
        mean_x = sum_x / len(unitList)
        mean_y = sum_y / len(unitList)

        return [mean_x, mean_y]

    def splitAction(self, action_id):
        """Decode an index into ``smart_actions`` as (name, x, y).

        Names of the form 'attack_<x>_<y>' are split, in which case x and y are
        returned as strings; for every other action x and y are the integer 0.
        """
        smart_action = smart_actions[action_id]

        x = 0
        y = 0
        if '_' in smart_action:
            smart_action, x, y = smart_action.split('_')

        return (smart_action, x, y)

    def unit_type_is_selected(self, obs, unit_type):
        """Return True if the first single- or multi-selected unit has ``unit_type``."""
        if (len(obs.observation.single_select) > 0 and
            obs.observation.single_select[0].unit_type == unit_type):
              return True

        if (len(obs.observation.multi_select) > 0 and
            obs.observation.multi_select[0].unit_type == unit_type):
              return True

        return False

    def get_units_by_type(self, obs, unit_type):
        """Return the entries of ``feature_units`` whose unit_type matches."""
        return [unit for unit in obs.observation.feature_units
                if unit.unit_type == unit_type]

    def can_do(self, obs, action):
        """Return True if the action id is in the observation's available actions."""
        return action in obs.observation.available_actions

    def step(self, obs):
        """Return the next PySC2 action for ``obs``.

        Advances the three-step cycle described in the class docstring and
        falls back to ``no_op`` whenever the planned command is not possible.
        """
        super(TerranSparseRewardRLAgent, self).step(obs)

        #time.sleep(0.5)

        if obs.first():
            # Work out which side of the map we start on from our own minimap pixels.
            player_y, player_x = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.SELF).nonzero()
            self.base_top_left = 1 if player_y.any() and player_y.mean() <= 31 else 0

        ccs = self.get_units_by_type(obs, units.Terran.CommandCenter)
        if len(ccs) > 0:
            self.cc_x, self.cc_y = self.getMeanLocation(ccs)

        cc_count = len(ccs)

        supply_depot_count = len(self.get_units_by_type(obs, units.Terran.SupplyDepot))

        barracks_count = len(self.get_units_by_type(obs, units.Terran.Barracks))

        # NOTE(review): food_used looks like total supply in use rather than
        # army-only supply -- confirm this is the intended state feature.
        army_supply = obs.observation.player.food_used

        if self.move_number == 0:
            self.move_number += 1

            # State: four unit counters followed by four enemy "hot square" flags.
            current_state = np.zeros(8)
            current_state[0] = cc_count
            current_state[1] = supply_depot_count
            current_state[2] = barracks_count
            current_state[3] = army_supply

            # Flag each 32x32 minimap quadrant that contains an enemy pixel.
            hot_squares = np.zeros(4)
            enemy_y, enemy_x = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.ENEMY).nonzero()
            for i in range(0, len(enemy_y)):
                y = int(math.ceil((enemy_y[i] + 1) / 32))
                x = int(math.ceil((enemy_x[i] + 1) / 32))

                hot_squares[((y - 1) * 2) + (x - 1)] = 1

            # Reverse the quadrant order so the state is relative to our own base.
            if not self.base_top_left:
                hot_squares = hot_squares[::-1]

            for i in range(0, 4):
                current_state[i + 4] = hot_squares[i]

            # Intermediate transitions are always learned with a reward of 0.
            if self.previous_action is not None:
                self.qlearn.learn(str(self.previous_state), self.previous_action, 0, str(current_state))

            rl_action = self.qlearn.choose_action(str(current_state))

            self.previous_state = current_state
            self.previous_action = rl_action

            smart_action, x, y = self.splitAction(self.previous_action)

            if smart_action == ACTION_BUILD_BARRACKS or smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                # Select a random SCV to do the building.
                if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                    scvs = self.get_units_by_type(obs, units.Terran.SCV)
                    if len(scvs) > 0:
                        scv = random.choice(scvs)
                        if scv.x >= 0 and scv.y >= 0:
                            return actions.FUNCTIONS.select_point("select", (scv.x,
                                                                              scv.y))

            elif smart_action == ACTION_BUILD_MARINE:
                # Select every barracks via one randomly chosen barracks.
                if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                    barracks = self.get_units_by_type(obs, units.Terran.Barracks)
                    if len(barracks) > 0:
                        barrack = random.choice(barracks)
                        if barrack.x >= 0 and barrack.y >= 0:
                            return actions.FUNCTIONS.select_point("select_all_type", (barrack.x,
                                                                                  barrack.y))

            elif smart_action == ACTION_ATTACK:
                if self.can_do(obs, actions.FUNCTIONS.select_army.id):
                    return actions.FUNCTIONS.select_army("select")

        elif self.move_number == 1:
            self.move_number += 1

            smart_action, x, y = self.splitAction(self.previous_action)

            if smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                # At most two depots, at fixed offsets from the command centre.
                if supply_depot_count < 2 and self.can_do(obs, actions.FUNCTIONS.Build_SupplyDepot_screen.id):
                    if len(ccs) > 0:
                        if supply_depot_count == 0:
                            target = self.transformDistance(self.cc_x, -35, self.cc_y, 0)
                        elif supply_depot_count == 1:
                            target = self.transformDistance(self.cc_x, -25, self.cc_y, -25)

                        return actions.FUNCTIONS.Build_SupplyDepot_screen("now", target)

            elif smart_action == ACTION_BUILD_BARRACKS:
                # At most two barracks, at fixed offsets from the command centre.
                if barracks_count < 2 and self.can_do(obs, actions.FUNCTIONS.Build_Barracks_screen.id):
                    if len(ccs) > 0:
                        if  barracks_count == 0:
                            target = self.transformDistance(self.cc_x, 15, self.cc_y, -9)
                        elif  barracks_count == 1:
                            target = self.transformDistance(self.cc_x, 15, self.cc_y, 12)

                        return actions.FUNCTIONS.Build_Barracks_screen("now", target)

            elif smart_action == ACTION_BUILD_MARINE:
                if self.can_do(obs, actions.FUNCTIONS.Train_Marine_quick.id):
                    return actions.FUNCTIONS.Train_Marine_quick("queued")

            elif smart_action == ACTION_ATTACK:
                if self.can_do(obs, actions.FUNCTIONS.Attack_minimap.id):
                    # Jitter the target by -8, 0 or +8 on each axis.
                    x_offset = random.randint(-1, 1)
                    y_offset = random.randint(-1, 1)

                    return actions.FUNCTIONS.Attack_minimap("now", self.transformLocation(int(x) + (x_offset * 8), int(y) + (y_offset * 8)))

        elif self.move_number == 2:
            self.move_number = 0

            smart_action, x, y = self.splitAction(self.previous_action)

            if smart_action == ACTION_BUILD_BARRACKS or smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                if self.can_do(obs, actions.FUNCTIONS.Harvest_Gather_screen.id):
                    mfs = self.get_units_by_type(obs, units.Neutral.MineralField)
                    if len(mfs) > 0:
                        mf = random.choice(mfs)
                        if mf.x >= 0 and mf.y >= 0:
                            # Queue the gather order behind the build command from the
                            # previous step; "now" would replace that command.
                            return actions.FUNCTIONS.Harvest_Gather_screen("queued", (mf.x,mf.y))

        return actions.FUNCTIONS.no_op()

### [run code]

In [None]:
# Hand control to absl, which parses the flags in sys.argv and then calls `main`.
# NOTE(review): `main` is not defined in this cell; presumably it comes from the
# "Run an agent" cell in section 0 -- confirm that cell has been executed first.
if __name__ == "__main__":
  app.run(main)

## 5. Detecting End of Game

In [None]:
import random
import time
import math
import os.path

import numpy as np
import pandas as pd


from pysc2.agents import base_agent
from pysc2.env import sc2_env
from pysc2.lib import actions, features, units
from absl import app

In [None]:
# Base name (without the '.gz' suffix) of the pickled Q-table.
DATA_FILE = 'rlagent_with_sparse_reward_learning_data'

# Names of the high-level ("smart") actions the agent can choose from.
ACTION_DO_NOTHING = 'donothing'
ACTION_BUILD_SUPPLY_DEPOT = 'buildsupplydepot'
ACTION_BUILD_BARRACKS = 'buildbarracks'
ACTION_BUILD_MARINE = 'buildmarine'
ACTION_ATTACK = 'attack'

smart_actions = [
    ACTION_DO_NOTHING,
    ACTION_BUILD_SUPPLY_DEPOT,
    ACTION_BUILD_BARRACKS,
    ACTION_BUILD_MARINE,
]

# One attack action per 32x32 quadrant of a 64x64 grid, encoded as
# 'attack_<x>_<y>'. mm_x / mm_y visit the last cell of each quadrant (31, 63);
# subtracting 16 gives the quadrant centres (15, 47).
for mm_x in range(31, 64, 32):
    for mm_y in range(31, 64, 32):
        smart_actions.append('%s_%d_%d' % (ACTION_ATTACK, mm_x - 16, mm_y - 16))

In [None]:
# reference from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow
class QLearningTable:
    """Tabular Q-learning with an epsilon-greedy policy.

    The Q-table is a ``pandas.DataFrame`` with one row per visited state
    (indexed by the state's label) and one column per action.

    Args:
        actions: list of action identifiers; used as the table's columns.
        learning_rate: step size applied to each temporal-difference update.
        reward_decay: discount factor (gamma) applied to the next state's best value.
        e_greedy: probability of taking the greedy action; otherwise an action
            is drawn uniformly at random.
    """

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        """Return an action for ``observation`` using the epsilon-greedy rule.

        The state is added to the table first if it has not been seen before.
        """
        self.check_state_exist(observation)

        if np.random.uniform() < self.epsilon:
            # choose best action
            state_action = self.q_table.loc[observation, :]

            # some actions have the same value: shuffle so ties are broken at random
            state_action = state_action.reindex(np.random.permutation(state_action.index))

            action = state_action.idxmax()
        else:
            # choose random action
            action = np.random.choice(self.actions)

        return action

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        Args:
            s: label of the state the action was taken in.
            a: the action taken (a column of the Q-table).
            r: reward received.
            s_: label of the resulting state; the literal ``'terminal'`` means
                the episode ended, so no future value is bootstrapped.
        """
        self.check_state_exist(s_)
        self.check_state_exist(s)

        q_predict = self.q_table.loc[s, a]

        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r  # next state is terminal

        # update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)

    def check_state_exist(self, state):
        """Add an all-zero row for ``state`` if the table does not have one yet."""
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; assigning to a new
            # label through .loc enlarges the table instead. Float zeros keep
            # the columns as float64.
            self.q_table.loc[state] = [0.0] * len(self.actions)

In [None]:
class TerranSparseRewardRLAgent(base_agent.BaseAgent):
    """Terran agent that picks high-level "smart actions" with tabular Q-learning.

    Each smart action is spread over three consecutive game steps, tracked by
    ``self.move_number``:

    * 0 -- build the state, update the Q-table, choose an action and select
      the unit(s) that will carry it out;
    * 1 -- issue the build / train / attack command;
    * 2 -- for build actions, queue the SCV back onto a mineral field.

    Intermediate transitions are learned with a reward of 0. On the last
    observation of an episode the environment reward is applied against the
    'terminal' state and the Q-table is written to ``DATA_FILE + '.gz'``.
    """

    def __init__(self):
        super(TerranSparseRewardRLAgent, self).__init__()

        # One Q-table column per index into ``smart_actions``.
        self.qlearn = QLearningTable(actions=list(range(len(smart_actions))))

        self.previous_action = None
        self.previous_state = None

        # Mean screen position of the command centre(s), refreshed in step().
        self.cc_y = None
        self.cc_x = None

        # Set from the first observation of an episode in step().
        self.base_top_left = None

        # Position (0, 1 or 2) inside the current three-step smart action.
        self.move_number = 0

        # Resume from the Q-table stored at the end of an earlier episode.
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        if os.path.isfile(DATA_FILE + '.gz'):
            self.qlearn.q_table = pd.read_pickle(DATA_FILE + '.gz', compression='gzip')

    def transformDistance(self, x, x_distance, y, y_distance):
        """Offset (x, y) by the given distances, negating them when the base is not top-left."""
        if not self.base_top_left:
            return [x - x_distance, y - y_distance]

        return [x + x_distance, y + y_distance]

    def transformLocation(self, x, y):
        """Return [x, y], reflected as [64 - x, 64 - y] when the base is not top-left.

        NOTE(review): the constant 64 presumably matches the minimap size -- confirm.
        """
        if not self.base_top_left:
            return [64 - x, 64 - y]

        return [x, y]

    def getMeanLocation(self, unitList):
        """Return [mean_x, mean_y] of the units' ``x`` / ``y`` attributes.

        Raises ZeroDivisionError for an empty list; callers check for that.
        """
        sum_x = 0
        sum_y = 0
        for unit in unitList:
            sum_x += unit.x
            sum_y += unit.y
        mean_x = sum_x / len(unitList)
        mean_y = sum_y / len(unitList)

        return [mean_x, mean_y]

    def splitAction(self, action_id):
        """Decode an index into ``smart_actions`` as (name, x, y).

        Names of the form 'attack_<x>_<y>' are split, in which case x and y are
        returned as strings; for every other action x and y are the integer 0.
        """
        smart_action = smart_actions[action_id]

        x = 0
        y = 0
        if '_' in smart_action:
            smart_action, x, y = smart_action.split('_')

        return (smart_action, x, y)

    def unit_type_is_selected(self, obs, unit_type):
        """Return True if the first single- or multi-selected unit has ``unit_type``."""
        if (len(obs.observation.single_select) > 0 and
            obs.observation.single_select[0].unit_type == unit_type):
              return True

        if (len(obs.observation.multi_select) > 0 and
            obs.observation.multi_select[0].unit_type == unit_type):
              return True

        return False

    def get_units_by_type(self, obs, unit_type):
        """Return the entries of ``feature_units`` whose unit_type matches."""
        return [unit for unit in obs.observation.feature_units
                if unit.unit_type == unit_type]

    def can_do(self, obs, action):
        """Return True if the action id is in the observation's available actions."""
        return action in obs.observation.available_actions

    def step(self, obs):
        """Return the next PySC2 action for ``obs``.

        Handles the end of an episode first (final Q-update, save, reset), then
        advances the three-step cycle described in the class docstring. Falls
        back to ``no_op`` whenever the planned command is not possible.
        """
        super(TerranSparseRewardRLAgent, self).step(obs)

        #time.sleep(0.5)

        if obs.last():
            reward = obs.reward

            # Nothing to credit if the episode ended before any action was chosen;
            # learning from (None, None) would raise a KeyError in the Q-table.
            if self.previous_action is not None:
                self.qlearn.learn(str(self.previous_state), self.previous_action, reward, 'terminal')

            # compression must be passed by keyword: positional use is deprecated
            # in pandas 2.1 and rejected from pandas 3.0.
            self.qlearn.q_table.to_pickle(DATA_FILE + '.gz', compression='gzip')

            # Reset per-episode state for the next game.
            self.previous_action = None
            self.previous_state = None

            self.move_number = 0

            return actions.FUNCTIONS.no_op()

        if obs.first():
            # Work out which side of the map we start on from our own minimap pixels.
            player_y, player_x = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.SELF).nonzero()
            self.base_top_left = 1 if player_y.any() and player_y.mean() <= 31 else 0

        ccs = self.get_units_by_type(obs, units.Terran.CommandCenter)
        if len(ccs) > 0:
            self.cc_x, self.cc_y = self.getMeanLocation(ccs)

        cc_count = len(ccs)

        supply_depot_count = len(self.get_units_by_type(obs, units.Terran.SupplyDepot))

        barracks_count = len(self.get_units_by_type(obs, units.Terran.Barracks))

        # NOTE(review): food_used looks like total supply in use rather than
        # army-only supply -- confirm this is the intended state feature.
        army_supply = obs.observation.player.food_used

        if self.move_number == 0:
            self.move_number += 1

            # State: four unit counters followed by four enemy "hot square" flags.
            current_state = np.zeros(8)
            current_state[0] = cc_count
            current_state[1] = supply_depot_count
            current_state[2] = barracks_count
            current_state[3] = army_supply

            # Flag each 32x32 minimap quadrant that contains an enemy pixel.
            hot_squares = np.zeros(4)
            enemy_y, enemy_x = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.ENEMY).nonzero()
            for i in range(0, len(enemy_y)):
                y = int(math.ceil((enemy_y[i] + 1) / 32))
                x = int(math.ceil((enemy_x[i] + 1) / 32))

                hot_squares[((y - 1) * 2) + (x - 1)] = 1

            # Reverse the quadrant order so the state is relative to our own base.
            if not self.base_top_left:
                hot_squares = hot_squares[::-1]

            for i in range(0, 4):
                current_state[i + 4] = hot_squares[i]

            # Intermediate transitions are always learned with a reward of 0.
            if self.previous_action is not None:
                self.qlearn.learn(str(self.previous_state), self.previous_action, 0, str(current_state))

            rl_action = self.qlearn.choose_action(str(current_state))

            self.previous_state = current_state
            self.previous_action = rl_action

            smart_action, x, y = self.splitAction(self.previous_action)

            if smart_action == ACTION_BUILD_BARRACKS or smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                # Select a random SCV to do the building.
                if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                    scvs = self.get_units_by_type(obs, units.Terran.SCV)
                    if len(scvs) > 0:
                        scv = random.choice(scvs)
                        if scv.x >= 0 and scv.y >= 0:
                            return actions.FUNCTIONS.select_point("select", (scv.x,
                                                                              scv.y))

            elif smart_action == ACTION_BUILD_MARINE:
                # Select every barracks via one randomly chosen barracks.
                if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                    barracks = self.get_units_by_type(obs, units.Terran.Barracks)
                    if len(barracks) > 0:
                        barrack = random.choice(barracks)
                        if barrack.x >= 0 and barrack.y >= 0:
                            return actions.FUNCTIONS.select_point("select_all_type", (barrack.x,
                                                                                  barrack.y))

            elif smart_action == ACTION_ATTACK:
                if self.can_do(obs, actions.FUNCTIONS.select_army.id):
                    return actions.FUNCTIONS.select_army("select")

        elif self.move_number == 1:
            self.move_number += 1

            smart_action, x, y = self.splitAction(self.previous_action)

            if smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                # At most two depots, at fixed offsets from the command centre.
                if supply_depot_count < 2 and self.can_do(obs, actions.FUNCTIONS.Build_SupplyDepot_screen.id):
                    if len(ccs) > 0:
                        if supply_depot_count == 0:
                            target = self.transformDistance(self.cc_x, -35, self.cc_y, 0)
                        elif supply_depot_count == 1:
                            target = self.transformDistance(self.cc_x, -25, self.cc_y, -25)

                        return actions.FUNCTIONS.Build_SupplyDepot_screen("now", target)

            elif smart_action == ACTION_BUILD_BARRACKS:
                # At most two barracks, at fixed offsets from the command centre.
                if barracks_count < 2 and self.can_do(obs, actions.FUNCTIONS.Build_Barracks_screen.id):
                    if len(ccs) > 0:
                        if  barracks_count == 0:
                            target = self.transformDistance(self.cc_x, 15, self.cc_y, -9)
                        elif  barracks_count == 1:
                            target = self.transformDistance(self.cc_x, 15, self.cc_y, 12)

                        return actions.FUNCTIONS.Build_Barracks_screen("now", target)

            elif smart_action == ACTION_BUILD_MARINE:
                if self.can_do(obs, actions.FUNCTIONS.Train_Marine_quick.id):
                    return actions.FUNCTIONS.Train_Marine_quick("queued")

            elif smart_action == ACTION_ATTACK:
                if self.can_do(obs, actions.FUNCTIONS.Attack_minimap.id):
                    # Jitter the target by -8, 0 or +8 on each axis.
                    x_offset = random.randint(-1, 1)
                    y_offset = random.randint(-1, 1)

                    return actions.FUNCTIONS.Attack_minimap("now", self.transformLocation(int(x) + (x_offset * 8), int(y) + (y_offset * 8)))

        elif self.move_number == 2:
            self.move_number = 0

            smart_action, x, y = self.splitAction(self.previous_action)

            if smart_action == ACTION_BUILD_BARRACKS or smart_action == ACTION_BUILD_SUPPLY_DEPOT:
                if self.can_do(obs, actions.FUNCTIONS.Harvest_Gather_screen.id):
                    mfs = self.get_units_by_type(obs, units.Neutral.MineralField)
                    if len(mfs) > 0:
                        mf = random.choice(mfs)
                        if mf.x >= 0 and mf.y >= 0:
                            # Queued so the SCV returns to mining after the build
                            # command issued in the previous step.
                            return actions.FUNCTIONS.Harvest_Gather_screen("queued", (mf.x,mf.y))

        return actions.FUNCTIONS.no_op()

### [run code]

In [None]:
# Hand control to absl, which parses the flags in sys.argv and then calls `main`.
# NOTE(review): `main` is not defined in this cell; presumably it comes from the
# "Run an agent" cell in section 0 -- confirm that cell has been executed first.
if __name__ == "__main__":
  app.run(main)

## 6. Refining 
- Ignoring Learning When State Does Not Change : 'Maximization Bias in RL' Problem
- Preventing Invalid Actions
- Adding Our Unit Locations to the State

![Winning rate graph](./images/rlagent_with_sparse_reward_learning_scoreTerran-Terran-350_Eps.png)

In [10]:
import random
import time
import math
import os.path

import numpy as np
import pandas as pd
from collections import deque
import pickle

from pysc2.agents import base_agent
from pysc2.env import sc2_env
from pysc2.lib import actions, features, units
from absl import app

In [11]:
# Base names (without suffix) of the pickled Q-table and of the score log.
DATA_FILE = 'rlagent_with_sparse_reward_learning_data'
SCORE_FILE = 'rlagent_with_sparse_reward_learning_score'

# Names of the high-level ("smart") actions the agent can choose from.
ACTION_DO_NOTHING = 'donothing'
ACTION_BUILD_SUPPLY_DEPOT = 'buildsupplydepot'
ACTION_BUILD_BARRACKS = 'buildbarracks'
ACTION_BUILD_MARINE = 'buildmarine'
ACTION_ATTACK = 'attack'

smart_actions = [
    ACTION_DO_NOTHING,
    ACTION_BUILD_SUPPLY_DEPOT,
    ACTION_BUILD_BARRACKS,
    ACTION_BUILD_MARINE,
]

# One attack action per 32x32 quadrant of a 64x64 grid, encoded as
# 'attack_<x>_<y>'. mm_x / mm_y visit the last cell of each quadrant (31, 63);
# subtracting 16 gives the quadrant centres (15, 47).
for mm_x in range(31, 64, 32):
    for mm_y in range(31, 64, 32):
        smart_actions.append('%s_%d_%d' % (ACTION_ATTACK, mm_x - 16, mm_y - 16))

scores = []                        # list containing scores from each episode
scores_window = deque(maxlen=100)  # last 100 scores

### 첫째, Ignoring Learning When State Does Not Change : 'Maximization Bias in RL' Problem 해결 방법
#### 학습의 초기 과정에서 같은 State에 자주 도착하는 경우 덜 가치 있는 Action을 가장 가치 있는 Action으로 밀어붙이는 경향이 생깁니다. 이것이 반복되어 시간이 지남에 따라 완전히 다른 State 대신 동일한 State에 더 자주 도착하게 되면 모든 보상이 0에 가까워지는 문제가 생깁니다.

#### 이를 해결하기 위해 아래와 같이 간단한 방법을 추가함.

    def learn(self, s, a, r, s_):
        if s == s_:
            return


### 둘째, Preventing Invalid Actions

< Invalid Actions 의 예>
- supply depot 제한 갯수 2에 도달했거나 supply depot를 건설할 SCV가 없는 경우 에이전트가 supply depot를 건설할 수 없도록 한다.
- supply depot가 없거나 barrack 제한 갯수 2에 도달했거나 barrack를 지을 SCV가 없다면 에이전트가 barrack를 지을 수 없도록 한다.
- barrack가 없거나 supply limit에 도달한 경우 marine을 훈련시키지 않는다.

#### 학습과정에서 에이전트가 Invalid Actions을 시도하는 것이 반복적으로 관찰됩니다. 사용가능한 Action의 수가 Available Actions 절반인 경우가 많기 때문에 에이전트는 불필요한 Action에서 학습하는 데 시간을 많이 보내게 됩니다.
#### 이러한 Invalid Actions을 필터링하여 에이전트가 State 변경으로 이어지는 Action을 시도하는 데 집중하도록하여 Exploration을 줄이고 학습 시간을 개선 할 수 있습니다.

#### 이를 해결하기 위해 아래와 같이 self.disallowed_actions 와 excluded_action 을 활용하는 방법을 추가함.

### 셋째, Adding Our Unit Locations to the State

#### "아군이 어디에 있는지 모른다면 어떤 위치가 공격하기에 가장 좋은지 어떻게 알 수 있을까요?"

In [12]:
# reference from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow
class QLearningTable:
    """Tabular Q-learning with an epsilon-greedy policy and per-state action masks.

    The Q-table is a ``pandas.DataFrame`` with one row per visited state
    (indexed by the state's label) and one column per action.
    ``disallowed_actions`` remembers, for each state passed to
    ``choose_action``, which actions were excluded there, so that ``learn``
    can ignore them when bootstrapping from that state.

    Args:
        actions: list of action identifiers; used as the table's columns.
        learning_rate: step size applied to each temporal-difference update.
        reward_decay: discount factor (gamma) applied to the next state's best value.
        e_greedy: probability of taking the greedy action; otherwise an action
            is drawn uniformly at random from the allowed ones.
    """

    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)
        self.disallowed_actions = {}

    def choose_action(self, observation, excluded_actions=None):
        """Return an allowed action for ``observation`` (epsilon-greedy).

        Args:
            observation: label of the current state.
            excluded_actions: optional iterable of actions that must not be
                chosen in this state. It is recorded for later ``learn`` calls.
        """
        # Copy so neither a shared default nor the caller's list is stored by reference.
        excluded_actions = [] if excluded_actions is None else list(excluded_actions)

        self.check_state_exist(observation)

        self.disallowed_actions[observation] = excluded_actions

        # drop() returns a new Series, so the Q-table row itself is untouched.
        state_action = self.q_table.loc[observation, :].drop(labels=excluded_actions)

        if np.random.uniform() < self.epsilon:
            # some actions have the same value: shuffle so ties are broken at random
            state_action = state_action.reindex(np.random.permutation(state_action.index))

            action = state_action.idxmax()
        else:
            # choose random action
            action = np.random.choice(state_action.index)

        return action

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        Transitions that leave the state unchanged are ignored. When ``s_`` is
        the literal ``'terminal'`` no future value is bootstrapped; otherwise
        the best value among the actions allowed in ``s_`` is used.
        """
        if s == s_:
            return

        self.check_state_exist(s_)
        self.check_state_exist(s)

        q_predict = self.q_table.loc[s, a]

        if s_ != 'terminal':
            s_rewards = self.q_table.loc[s_, :].drop(
                labels=list(self.disallowed_actions.get(s_, [])))
            q_target = r + self.gamma * s_rewards.max()
        else:
            q_target = r  # next state is terminal

        # update
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)

    def check_state_exist(self, state):
        """Add an all-zero row for ``state`` if the table does not have one yet."""
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; assigning to a new
            # label through .loc enlarges the table instead. Float zeros keep
            # the columns as float64.
            self.q_table.loc[state] = [0.0] * len(self.actions)

In [13]:
class TerranSparseRewardRLAgent(base_agent.BaseAgent):
    """Terran agent driven by a tabular Q-learner that is rewarded only at game end.

    Every decision is spread over three consecutive calls to ``step``, tracked
    by ``move_number``:

    * 0 -- encode the state, update the Q-table for the previous decision with
      reward 0, choose a new action and select the unit(s) that action needs;
    * 1 -- issue the build / train / attack order itself;
    * 2 -- after a build order, queue a gather order on a random mineral field.

    The only non-zero reward fed to the learner is ``obs.reward`` on the last
    observation of an episode. The Q-table is persisted to ``DATA_FILE + '.gz'``
    and the rolling win/tie/loss rates to ``SCORE_FILE + '.txt'``.
    """

    def __init__(self):
        super(TerranSparseRewardRLAgent, self).__init__()

        self.qlearn = QLearningTable(actions=list(range(len(smart_actions))))

        self.previous_action = None
        self.previous_state = None

        self.cc_y = None
        self.cc_x = None

        # Decided on the first observation of every episode; defined here so the
        # transform helpers never hit a missing attribute.
        self.base_top_left = None

        self.move_number = 0

        # Resume from a previously saved Q-table when one exists.
        # NOTE(review): read_pickle unpickles arbitrary objects -- only load
        # Q-tables written by this agent.
        if os.path.isfile(DATA_FILE + '.gz'):
            self.qlearn.q_table = pd.read_pickle(DATA_FILE + '.gz', compression='gzip')

    def transformDistance(self, x, x_distance, y, y_distance):
        """Offset ``(x, y)`` by the given distances, mirrored when the base is not top-left."""
        if not self.base_top_left:
            return [x - x_distance, y - y_distance]

        return [x + x_distance, y + y_distance]

    def transformLocation(self, x, y):
        """Return ``(x, y)`` as-is for a top-left base, otherwise mirrored as ``64 - coordinate``."""
        if not self.base_top_left:
            return [64 - x, 64 - y]

        return [x, y]

    def getMeanLocation(self, unitList):
        """Return ``[mean_x, mean_y]`` of the units; ``unitList`` must not be empty."""
        sum_x = 0
        sum_y = 0
        for unit in unitList:
            sum_x += unit.x
            sum_y += unit.y
        mean_x = sum_x / len(unitList)
        mean_y = sum_y / len(unitList)

        return [mean_x, mean_y]

    def splitAction(self, action_id):
        """Split the smart action at index ``action_id`` into ``(name, x, y)``.

        Names of the form ``name_x_y`` yield ``x`` and ``y`` as strings; names
        without an underscore yield the integers ``0, 0``.
        """
        smart_action = smart_actions[action_id]

        x = 0
        y = 0
        if '_' in smart_action:
            smart_action, x, y = smart_action.split('_')

        return (smart_action, x, y)

    def unit_type_is_selected(self, obs, unit_type):
        """Return True when the first single- or multi-selected unit has ``unit_type``."""
        if (len(obs.observation.single_select) > 0 and
                obs.observation.single_select[0].unit_type == unit_type):
            return True

        if (len(obs.observation.multi_select) > 0 and
                obs.observation.multi_select[0].unit_type == unit_type):
            return True

        return False

    def get_units_by_type(self, obs, unit_type):
        """Return the feature units in the observation whose type equals ``unit_type``."""
        return [unit for unit in obs.observation.feature_units
                if unit.unit_type == unit_type]

    def can_do(self, obs, action):
        """Return True when ``action`` (a function id) is currently available."""
        return action in obs.observation.available_actions

    def step(self, obs):
        """Advance the three-step decision cycle and return the next PySC2 action.

        Args:
            obs: the current PySC2 timestep.

        Returns:
            A PySC2 function call; ``no_op`` whenever the current sub-step has
            nothing it is able to do.
        """
        super(TerranSparseRewardRLAgent, self).step(obs)

        if obs.last():
            return self._finish_episode(obs)

        if obs.first():
            player_y, _ = (obs.observation.feature_minimap.player_relative == features.PlayerRelative.SELF).nonzero()
            # len() is the "did we see any own pixel" test. The former .any()
            # was also False when every own pixel happened to sit on row 0.
            self.base_top_left = 1 if len(player_y) > 0 and player_y.mean() <= 31 else 0

        ccs = self.get_units_by_type(obs, units.Terran.CommandCenter)
        if len(ccs) > 0:
            self.cc_x, self.cc_y = self.getMeanLocation(ccs)

        supply_depot_count = len(self.get_units_by_type(obs, units.Terran.SupplyDepot))
        barracks_count = len(self.get_units_by_type(obs, units.Terran.Barracks))

        command = None

        if self.move_number == 0:
            self.move_number += 1
            command = self._decide_and_select(obs, len(ccs), supply_depot_count, barracks_count)

        elif self.move_number == 1:
            self.move_number += 1
            smart_action, x, y = self.splitAction(self.previous_action)
            command = self._issue_order(obs, smart_action, x, y, ccs,
                                        supply_depot_count, barracks_count)

        elif self.move_number == 2:
            self.move_number = 0
            smart_action, _, _ = self.splitAction(self.previous_action)
            command = self._return_to_minerals(obs, smart_action)

        if command is None:
            return actions.FUNCTIONS.no_op()
        return command

    def _finish_episode(self, obs):
        """Learn from the final reward, persist Q-table and scores, reset per-episode state."""
        reward = obs.reward

        # An episode can end before any decision was taken; there is then no
        # (state, action) pair to credit.
        if self.previous_action is not None:
            self.qlearn.learn(str(self.previous_state), self.previous_action, reward, 'terminal')

        self.qlearn.q_table.to_pickle(DATA_FILE + '.gz', compression='gzip')

        scores_window.append(reward)  # save most recent reward
        games = len(scores_window)
        win_rate = scores_window.count(1) / games * 100
        tie_rate = scores_window.count(0) / games * 100
        lost_rate = scores_window.count(-1) / games * 100

        # save most recent score (win_rate, tie_rate, lost_rate)
        scores.append([win_rate, tie_rate, lost_rate])
        with open(SCORE_FILE + '.txt', "wb") as fp:
            pickle.dump(scores, fp)

        self.previous_action = None
        self.previous_state = None

        self.move_number = 0

        return actions.FUNCTIONS.no_op()

    def _quadrant_flags(self, mask):
        """Return a length-4 array flagging which minimap quadrants contain a set pixel.

        Quadrants are 32 pixels wide (this assumes a 64x64 minimap) and ordered
        row-major; the order is reversed when the base is not top-left.
        """
        quadrants = np.zeros(4)
        ys, xs = mask.nonzero()
        # For non-negative integer pixels, ceil((p + 1) / 32) - 1 == p // 32.
        quadrants[(ys // 32) * 2 + (xs // 32)] = 1

        if not self.base_top_left:
            quadrants = quadrants[::-1]

        return quadrants

    def _decide_and_select(self, obs, cc_count, supply_depot_count, barracks_count):
        """Sub-step 0: encode the state, learn, choose an action, select its unit(s)."""
        player = obs.observation.player
        army_supply = player.food_army
        worker_supply = player.food_workers
        supply_free = player.food_cap - player.food_used

        # State layout: [0:4] building counts and army supply,
        # [4:8] enemy quadrants, [8:12] own quadrants.
        current_state = np.zeros(12)
        current_state[0] = cc_count
        current_state[1] = supply_depot_count
        current_state[2] = barracks_count
        current_state[3] = army_supply

        player_relative = obs.observation.feature_minimap.player_relative
        current_state[4:8] = self._quadrant_flags(player_relative == features.PlayerRelative.ENEMY)
        current_state[8:12] = self._quadrant_flags(player_relative == features.PlayerRelative.SELF)

        # Intermediate transitions carry no reward; only the end of the game does.
        if self.previous_action is not None:
            self.qlearn.learn(str(self.previous_state), self.previous_action, 0, str(current_state))

        # Indices are positions in smart_actions (defined elsewhere); presumably
        # 1 = supply depot, 2 = barracks, 3 = marine, 4-7 = attacks -- verify
        # against smart_actions.
        excluded_actions = []
        if supply_depot_count == 2 or worker_supply == 0:
            excluded_actions.append(1)

        if supply_depot_count == 0 or barracks_count == 2 or worker_supply == 0:
            excluded_actions.append(2)

        if supply_free == 0 or barracks_count == 0:
            excluded_actions.append(3)

        if army_supply == 0:
            excluded_actions.extend([4, 5, 6, 7])

        rl_action = self.qlearn.choose_action(str(current_state), excluded_actions)

        self.previous_state = current_state
        self.previous_action = rl_action

        smart_action, _, _ = self.splitAction(rl_action)

        if smart_action == ACTION_BUILD_BARRACKS or smart_action == ACTION_BUILD_SUPPLY_DEPOT:
            if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                scvs = self.get_units_by_type(obs, units.Terran.SCV)
                if len(scvs) > 0:
                    scv = random.choice(scvs)
                    if scv.x >= 0 and scv.y >= 0:
                        return actions.FUNCTIONS.select_point("select", (scv.x, scv.y))

        elif smart_action == ACTION_BUILD_MARINE:
            if self.can_do(obs, actions.FUNCTIONS.select_point.id):
                barracks = self.get_units_by_type(obs, units.Terran.Barracks)
                if len(barracks) > 0:
                    barrack = random.choice(barracks)
                    if barrack.x >= 0 and barrack.y >= 0:
                        return actions.FUNCTIONS.select_point("select_all_type", (barrack.x, barrack.y))

        elif smart_action == ACTION_ATTACK:
            if self.can_do(obs, actions.FUNCTIONS.select_army.id):
                return actions.FUNCTIONS.select_army("select")

        return None

    def _issue_order(self, obs, smart_action, x, y, ccs, supply_depot_count, barracks_count):
        """Sub-step 1: issue the build, train or attack order for the chosen action."""
        if smart_action == ACTION_BUILD_SUPPLY_DEPOT:
            if supply_depot_count < 2 and self.can_do(obs, actions.FUNCTIONS.Build_SupplyDepot_screen.id):
                if len(ccs) > 0:
                    # Fixed placements relative to the command center, one per depot.
                    if supply_depot_count == 0:
                        target = self.transformDistance(self.cc_x, -35, self.cc_y, 0)
                    else:
                        target = self.transformDistance(self.cc_x, -25, self.cc_y, -25)

                    return actions.FUNCTIONS.Build_SupplyDepot_screen("now", target)

        elif smart_action == ACTION_BUILD_BARRACKS:
            if barracks_count < 2 and self.can_do(obs, actions.FUNCTIONS.Build_Barracks_screen.id):
                if len(ccs) > 0:
                    # Fixed placements relative to the command center, one per barracks.
                    if barracks_count == 0:
                        target = self.transformDistance(self.cc_x, 15, self.cc_y, -9)
                    else:
                        target = self.transformDistance(self.cc_x, 15, self.cc_y, 12)

                    return actions.FUNCTIONS.Build_Barracks_screen("now", target)

        elif smart_action == ACTION_BUILD_MARINE:
            if self.can_do(obs, actions.FUNCTIONS.Train_Marine_quick.id):
                return actions.FUNCTIONS.Train_Marine_quick("queued")

        elif smart_action == ACTION_ATTACK:
            if self.can_do(obs, actions.FUNCTIONS.Attack_minimap.id):
                # Jitter the target by up to 8 minimap pixels on each axis.
                x_offset = random.randint(-1, 1)
                y_offset = random.randint(-1, 1)

                return actions.FUNCTIONS.Attack_minimap("now", self.transformLocation(int(x) + (x_offset * 8), int(y) + (y_offset * 8)))

        return None

    def _return_to_minerals(self, obs, smart_action):
        """Sub-step 2: after a build order, queue a gather order on a random mineral field."""
        if smart_action == ACTION_BUILD_BARRACKS or smart_action == ACTION_BUILD_SUPPLY_DEPOT:
            if self.can_do(obs, actions.FUNCTIONS.Harvest_Gather_screen.id):
                mfs = self.get_units_by_type(obs, units.Neutral.MineralField)
                if len(mfs) > 0:
                    mf = random.choice(mfs)
                    if mf.x >= 0 and mf.y >= 0:
                        return actions.FUNCTIONS.Harvest_Gather_screen("queued", (mf.x, mf.y))

        return None

### [run code]

In [None]:
# Start the PySC2 run through Abseil. Flags come from the sys.argv list that was
# patched near the top of the notebook; `main` is defined in an earlier cell.
if __name__ == "__main__":
  app.run(main)

I0908 22:54:34.908689 4549250496 sc_process.py:135] Launching SC2: /Applications/StarCraft II/Versions/Base81102/SC2.app/Contents/MacOS/SC2 -listen 127.0.0.1 -port 24748 -dataDir /Applications/StarCraft II/ -tempDir /var/folders/r1/x6k135_915z463fc7lc4hkp40000gn/T/sc-s__1457n/ -displayMode 0 -windowwidth 640 -windowheight 480 -windowx 50 -windowy 50
I0908 22:54:35.040081 4549250496 remote_controller.py:167] Connecting to: ws://127.0.0.1:24748/sc2api, attempt: 0, running: True
I0908 22:54:36.046929 4549250496 remote_controller.py:167] Connecting to: ws://127.0.0.1:24748/sc2api, attempt: 1, running: True
I0908 22:54:37.051900 4549250496 remote_controller.py:167] Connecting to: ws://127.0.0.1:24748/sc2api, attempt: 2, running: True
I0908 22:54:38.053664 4549250496 remote_controller.py:167] Connecting to: ws://127.0.0.1:24748/sc2api, attempt: 3, running: True
I0908 22:54:39.059527 4549250496 remote_controller.py:167] Connecting to: ws://127.0.0.1:24748/sc2api, attempt: 4, running: True
I09

   0/no_op                                              ()
   1/move_camera                                        (1/minimap [64, 64])
   2/select_point                                       (6/select_point_act [4]; 0/screen [84, 84])
   3/select_rect                                        (7/select_add [2]; 0/screen [84, 84]; 2/screen2 [84, 84])
   4/select_control_group                               (4/control_group_act [5]; 5/control_group_id [10])
 453/Stop_quick                                         (3/queued [2])
 230/Effect_Spray_screen                                (3/queued [2]; 0/screen [84, 84])
 549/Effect_Spray_minimap                               (3/queued [2]; 1/minimap [64, 64])
 264/Harvest_Gather_screen                              (3/queued [2]; 0/screen [84, 84])
 451/Smart_screen                                       (3/queued [2]; 0/screen [84, 84])
 452/Smart_minimap                                      (3/queued [2]; 1/minimap [64, 64])
 331/Move_screen    

I0908 22:55:48.199373 4549250496 sc2_env.py:725] Episode 1 finished after 12520 game steps. Outcome: [1], reward: [1], score: [7100]
I0908 22:55:52.745210 4549250496 sc2_env.py:507] Starting episode 2: [terran, terran] on Simple64


  43/Build_Bunker_screen                                (3/queued [2]; 0/screen [84, 84])
  44/Build_CommandCenter_screen                         (3/queued [2]; 0/screen [84, 84])
  11/build_queue                                        (11/build_queue_id [10])
   6/select_idle_worker                                 (10/select_worker [4])


I0908 22:57:13.768297 4549250496 sc2_env.py:725] Episode 2 finished after 22472 game steps. Outcome: [-1], reward: [-1], score: [7215]
I0908 22:57:18.352750 4549250496 sc2_env.py:507] Starting episode 3: [terran, terran] on Simple64


### [Winning rate graph]

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Base name (without the '.txt' suffix) of the pickled score history opened in the next cell.
SCORE_FILE = 'rlagent_with_sparse_reward_learning_score'

In [None]:
# Load the pickled score history (presumably the file the agent rewrites at the
# end of every episode -- confirm SCORE_FILE matches the training cell).
# NOTE(review): pickle.load can execute arbitrary code; only open score files
# produced by this notebook.
with open(SCORE_FILE + '.txt', "rb") as fp:
    scores = pickle.load(fp)

In [None]:
# Convert to an array for column-wise access when plotting
# (presumably one [win %, tie %, loss %] row per finished episode -- confirm against the training cell).
np_scores = np.array(scores)
np_scores

In [None]:
# plot the scores: one line per column of np_scores, drawn on an explicit Axes
# instead of the implicit pyplot state
fig, ax = plt.subplots()
episodes = np.arange(len(np_scores))
for column, (color, label) in enumerate([('r', 'win rate'),
                                         ('g', 'tie rate'),
                                         ('b', 'lose rate')]):
    ax.plot(episodes, np_scores.T[column], color=color, label=label)
ax.set_ylabel('Score %')
ax.set_xlabel('Episode #')
ax.legend(loc='best')
plt.show()