# SC2 AI prototype

In [49]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import pandas as pd
from os.path import isfile, join
import os, sys
import matplotlib.pyplot as plt


In [50]:
# load data
def loadDataEpisode(episodeNum, pathRoot="./data/"):
    """Load the four tab-separated tables recorded for one episode.

    Files are expected to be named ``<table>.<episodeNum>.tsv`` inside
    ``pathRoot`` and to have no header row.

    Args:
        episodeNum: Episode identifier used in the file names.
        pathRoot: Directory holding the episode files. A trailing separator
            is optional. Defaults to ``"./data/"``.

    Returns:
        Tuple ``(df_globalFeatures, df_macroRewards, df_action,
        df_effectiveAction)`` of pandas DataFrames.

    Raises:
        Whatever ``pandas.read_csv`` raises when a file is missing or
        malformed.
    """
    def readTsv(tableName):
        # All four tables share the same naming scheme and parse options.
        fileName = "{}.{}.tsv".format(tableName, episodeNum)
        return pd.read_csv(join(pathRoot, fileName), header=None, sep='\t')

    df_action = readTsv("actions")
    df_effectiveAction = readTsv("effectiveAction")
    df_globalFeatures = readTsv("globalFeatures")
    df_macroRewards = readTsv("macroRewards")

    return df_globalFeatures, df_macroRewards, df_action, df_effectiveAction

In [51]:
# Load the four tables of episode 0 into notebook-level variables.
gFeature, rewards, actions, effectiveAction = loadDataEpisode(0)

In [52]:
# Preview the first five rows of the global-feature table.
gFeature.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,50,0,12,14,2,12,0,0,0,0,0
1,50,0,12,14,2,12,0,0,0,0,0
2,0,0,13,14,1,12,0,0,0,1,0
3,0,0,13,14,1,12,0,0,0,1,0
4,0,0,13,14,1,12,0,0,0,1,0


In [53]:
# Shape of the feature matrix once its last row is dropped.
gFeature.values[:-1].shape

(1675, 11)

In [54]:
# Shape of the full feature matrix, for comparison with the truncated one.
gFeature.values.shape

(1676, 11)

In [55]:
# Preview the first ten effective-action ids.
effectiveAction.head(10)

Unnamed: 0,0
0,2
1,467
2,2
3,0
4,2
5,0
6,2
7,0
8,2
9,0


In [64]:
# see: https://github.com/deepmind/pysc2/blob/7b7afd7eeae985e6498855ac368c865ed9d527fb/pysc2/lib/actions.py
# Action ids recognised by the one-hot encoder; the position in this list is
# the one-hot column index. Presumably these are pysc2 function ids from the
# file linked above -- TODO confirm what each id stands for.
hardCodeActionFuncMap = [0,2,467,483] #np.unique(effectiveAction.values)

# Sanitize the actions, reducing them to the ones that actually had an effect.
# TODO: make the network learn which actions are ineffective automatically?
def sanitizeMacroAction(effectiveAction):
    """Return a copy of the action ids with redundant id-2 entries zeroed.

    An entry equal to 2 that is immediately followed by an entry equal to 0
    is rewritten to 0. All other entries are left untouched, and the input
    is not modified.

    Args:
        effectiveAction: pandas object whose ``.values`` holds the action
            ids, one step per row (a single-column DataFrame in this
            notebook).

    Returns:
        NumPy array with the same shape as ``effectiveAction.values``.
    """
    effectiveActionNp = np.copy(effectiveAction.values)

    # Pair every step with its successor. Both slices are views, so writing
    # through `currentIds` updates `effectiveActionNp` in place.
    currentIds = effectiveActionNp[:-1]
    nextIds = effectiveActionNp[1:]

    # note this rule is only valid for macro, select statements
    # The mask is computed before any write, which matches the sequential
    # loop it replaces: a rewrite at step i never feeds a later comparison.
    redundantSelect = (currentIds == 2) & (nextIds == 0)
    currentIds[redundantSelect] = 0

    return effectiveActionNp

# Apply the sanitising rule to episode 0 and preview the first ten rows.
tempAction = sanitizeMacroAction(effectiveAction)
pd.DataFrame(tempAction).head(10)

Unnamed: 0,0
0,2
1,467
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [65]:
def convertToOneHotEffectiveAction(effectiveActionNp, actionFuncMap=None):
    """One-hot encode a column of action ids.

    Args:
        effectiveActionNp: Array of action ids with one id per row, shaped
            ``(n,)`` or ``(n, 1)``.
        actionFuncMap: Sequence of known action ids; the position of an id
            in this sequence is its one-hot column. Defaults to the
            module-level ``hardCodeActionFuncMap``.

    Returns:
        Float array of shape ``(n, len(actionFuncMap))`` with exactly one
        1 per row.

    Raises:
        ValueError: If a row holds more than one id, or an id is not
            present in ``actionFuncMap``.
    """
    if actionFuncMap is None:
        actionFuncMap = hardCodeActionFuncMap

    actionIds = np.asarray(effectiveActionNp)
    sampleLen = actionIds.shape[0]
    flatIds = actionIds.reshape(-1)
    if flatIds.size != sampleLen:
        raise ValueError(
            "expected one action id per row, got shape {}".format(actionIds.shape))

    # Build the id -> column lookup once instead of scanning the list per row.
    # setdefault keeps the first position of a duplicated id, like list.index.
    columnOf = {}
    for column, actionId in enumerate(actionFuncMap):
        columnOf.setdefault(actionId, column)

    effectiveActionEncoding = np.zeros((sampleLen, len(actionFuncMap)))
    for row, actionId in enumerate(flatIds.tolist()):
        if actionId not in columnOf:
            raise ValueError(
                "action id {!r} at row {} is not in the action map".format(actionId, row))
        effectiveActionEncoding[row, columnOf[actionId]] = 1

    return effectiveActionEncoding

# One-hot encode the sanitised actions of episode 0.
oneHotEncodedAction = convertToOneHotEffectiveAction(tempAction)

# Preview the first ten encoded rows.
pd.DataFrame(oneHotEncodedAction).head(10)

Unnamed: 0,0,1,2,3
0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0


In [66]:
# Row count of the encoded actions; compare with the truncated feature matrix.
oneHotEncodedAction.shape

(1675, 4)

In [70]:
# Drop the last feature row so the feature matrix has the same number of rows
# as the encoded actions before the two are joined.
globalFeature = np.copy(gFeature.values[:-1])

# Place the one-hot action columns to the right of the feature columns.
concatFeatureAction = np.concatenate((globalFeature,oneHotEncodedAction), axis = 1)
pd.DataFrame(concatFeatureAction).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,50.0,0.0,12.0,14.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,50.0,0.0,12.0,14.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,13.0,14.0,1.0,12.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,13.0,14.0,1.0,12.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,13.0,14.0,1.0,12.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,13.0,14.0,1.0,12.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,13.0,14.0,1.0,12.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,13.0,14.0,1.0,12.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,13.0,14.0,1.0,12.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
9,0.0,0.0,13.0,14.0,1.0,12.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [71]:
# Presumably the number of game loops covered by one agent step -- TODO confirm.
StepMul = 4
# Conversion factor from seconds to agent steps (true division: 16 / 4 * 1.4).
# Assumes 16 game loops per game-second and a 1.4x speed factor -- TODO confirm.
SecToStepMultiplier = (16/StepMul * 1.4)

def clip_log(probs):
    """Return the element-wise log of `probs` after clamping it to [1e-12, 1.0].

    The lower clamp keeps the logarithm finite when a probability is zero.
    """
    clamped = tf.clip_by_value(probs, 1e-12, 1.0)
    return tf.log(clamped)

class ZergMacroModel:
    """Actor-critic style model over concatenated feature/action vectors.

    Builds two small feed-forward networks (TensorFlow 1.x graph mode): a
    single-step model and a model that consumes ten stacked past steps.
    Each built model is the list
    ``[modelInput, modelValue, finalLoss, policy_network, value_network]``.

    Attributes:
        sess: The session passed to the constructor.
        step1Model: Model over one step of ``featureDim`` inputs.
        step60Model: Model over ``featureDim * 10`` inputs (10 past steps).
    """
    # sess = tf.Session()
    def __init__(self, sess, featureDim = 15, actionDim = 4):
        """Create the loss coefficients, the regularizer and both networks.

        Args:
            sess: TensorFlow session to be used with this model.
            featureDim: Width of one input row. The default of 15 matches the
                11 feature columns plus 4 one-hot action columns built
                earlier in this notebook.
            actionDim: Number of discrete actions in the policy output.
        """
        # Keep the session so training code can run the graph later.
        self.sess = sess

        # TODO: mathematical way to come up with these delays
#         self.rewardPredictionDelaySec = np.asarray([1/SecToStepMultiplier, 20, 60, 240])
#         self.rewardMovingWindowSec = np.asarray([1/SecToStepMultiplier, 4, 5, 15])

#         self.rewardDelayStep = self.rewardPredictionDelaySec * SecToStepMultiplier
#         self.rewardMovingWindowSec = self.rewardMovingWindowSec * SecToStepMultiplier

        self.value_coef = 1
        self.entropy_coef = 0.1
        # TODO: cost for not doing no_op
        self.policy_coef = 1
        self.regularizationCoef = 0.001

        self.reg = tf.contrib.layers.l2_regularizer(scale=self.regularizationCoef)

        # first try hard code
        # The builder is a method, so it must be called through self, and the
        # results are stored; as bare locals they were lost after __init__.
        self.step1Model = self.simple_model_fun(featureDim, actionDim, "step1Model")
        self.step60Model = self.simple_model_fun(featureDim*10, actionDim, "step60Model", hiddenSize = 30) # uses 10 past steps observation

    def simple_model_fun(self, featureDim, actionDim, name, hiddenSize = 10):
        """Build one shared-trunk policy/value network and its loss.

        Args:
            featureDim: Width of the input placeholder.
            actionDim: Number of policy outputs.
            name: Variable scope under which the network is created.
            hiddenSize: Units in the shared tanh hidden layer.

        Returns:
            List ``[modelInput, modelValue, finalLoss, policy_network,
            value_network]`` where ``modelInput`` is a ``[None, featureDim]``
            placeholder, ``modelValue`` a ``[None]`` placeholder of target
            values, ``policy_network`` the softmax output and
            ``value_network`` the ``[None, 1]`` value output.
        """
        # draw reference to https://github.com/inoryy/pysc2-rl-agent/blob/master/rl/agent.py

        with tf.variable_scope(name):
            modelInput = tf.placeholder(tf.float32, [None, featureDim])
            modelValue = tf.placeholder(tf.float32, [None])

            fnn1 = tf.layers.dense(inputs=modelInput, units=hiddenSize, activation=tf.nn.tanh, kernel_regularizer = self.reg)
            value_network = tf.layers.dense(inputs=fnn1, units=1, activation=None, kernel_regularizer = self.reg)

            fnn2 = tf.layers.dense(inputs=fnn1, units=actionDim, activation=None, kernel_regularizer = self.reg)
            policy_network = tf.nn.softmax(fnn2)

            # value_network is [batch, 1] but modelValue is [batch]. Subtracting
            # them directly would broadcast to [batch, batch], so give the
            # target an explicit trailing axis first.
            targetValue = tf.expand_dims(modelValue, axis=-1)
            valueError = targetValue - value_network

            adv = tf.stop_gradient(valueError)
            logpolicy = clip_log(policy_network)
            entropy = -tf.reduce_sum(policy_network * logpolicy, axis=-1)

            value_loss = self.value_coef * tf.reduce_mean(tf.square(valueError))
            entropy_loss = -self.entropy_coef * tf.reduce_mean(entropy)
            # NOTE(review): this averages log-probabilities over every action
            # rather than only the action taken; selecting the taken action
            # needs an extra action input -- confirm the intended objective.
            policy_loss = -self.policy_coef * tf.reduce_mean(logpolicy * adv)

            # NOTE(review): the kernel regularizers above are attached to the
            # layers but their losses are not added here -- confirm whether
            # they should be part of finalLoss.
            finalLoss = value_loss + entropy_loss + policy_loss

        return [modelInput, modelValue, finalLoss, policy_network, value_network]

    def trainWithEpisode(self, concatFeatureAction):
        """Train on one episode of concatenated feature/action rows.

        Not implemented yet; currently returns None.
        """
        return

    def featureExtractor(self, concatFeatureAction):
        """Derive model inputs from concatenated feature/action rows.

        Not implemented yet; currently returns None.
        """
        return
            
