In [25]:
import os.path
import numpy as np
import random
import timeit
import tensorflow as tf
# from flat_game import carmunk

In [26]:
# --- Q-learning hyper-parameters -------------------------------------------
# Probability of picking a random action; train_model decays it over time.
EPSILON = 1
# Discount factor applied to the best next-state Q-value in get_update.
GAMMA = 0.9

# --- Network dimensions ----------------------------------------------------
# Input width of the car / cat networks and the number of discrete actions.
CAR_SENSORS = 6
CAT_SENSORS = 12
NUM_ACTIONS = 3
# Widths of the two hidden layers built by get_net.
N1 = 164
N2 = 150
LEARNING_RATE = 1e-3

# --- Logging ---------------------------------------------------------------
# train_batch writes a summary whenever t is a multiple of SUMMARY_STEP.
SUMMARY_STEP = 1000
LOGS_DIR = './logs'

# --- Training schedule -----------------------------------------------------
observe = 1000  # Number of frames to observe before training.
train_frames = 10000  # Number of frames to play.
batch_size = 500
buffer_size = 50000

# --- Game / reward configuration -------------------------------------------
use_red_team = True
use_obstacles = False
# Reward values that get_update and train_model compare against to detect
# the end of an episode.
car_crash_penalty = -500
cat_crash_penalty = -500
cat_success_reward = 1000

In [27]:
# Taken from: https://goo.gl/SrpGW0
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Return a copy of ``graph_def`` with oversized constants replaced.

    Every node is copied into a fresh ``tf.GraphDef``. For nodes whose op is
    'Const', a tensor payload longer than ``max_const_size`` bytes is swapped
    for a short placeholder string recording the original size.
    """
    stripped = tf.GraphDef()
    for source_node in graph_def.node:
        node_copy = stripped.node.add()
        node_copy.MergeFrom(source_node)
        if node_copy.op != 'Const':
            continue
        payload = node_copy.attr['value'].tensor
        n_bytes = len(payload.tensor_content)
        if n_bytes > max_const_size:
            payload.tensor_content = "<stripped %d bytes>"%n_bytes
    return stripped

def show_graph(graph_def, max_const_size=32):
    """Display a TensorFlow graph inline in the notebook.

    ``graph_def`` may be a GraphDef or any object exposing ``as_graph_def()``.
    Large constants are stripped via ``strip_consts`` and the textual graph is
    embedded in a ``tf-graph-basic`` element inside an iframe.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()

    stripped = strip_consts(graph_def, max_const_size=max_const_size)
    pbtxt_literal = repr(str(stripped))
    # A random suffix gives the element its own id each time a graph is shown.
    element_id = 'graph'+str(np.random.rand())

    widget_html = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=pbtxt_literal, id=element_id)

    # The widget markup is placed in the iframe's srcdoc attribute, so its
    # double quotes have to be escaped.
    escaped_widget = widget_html.replace('"', '&quot;')
    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(escaped_widget)
    display(HTML(iframe))

In [28]:
# Taken from TensorFlow MNIST Tutorial
# https://goo.gl/gkZs36

# We can't initialize these variables to 0 - the network will get stuck.
def weight_variable(shape):
    """Build a weight Variable of the given shape, initialised from a
    truncated normal distribution with standard deviation 0.1."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Build a bias Variable of the given shape with every entry set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def variable_summaries(var):
    """Attach mean/stddev/max/min scalar summaries and a histogram to ``var``.

    ``tf.scalar_summary`` and ``tf.histogram_summary`` use the tag string
    verbatim; it is NOT prefixed with the enclosing name scope. Since this
    helper is called for several variables, plain tags such as 'mean' would be
    emitted more than once and the merged summary op rejects duplicate tags.
    The tags are therefore qualified with the current scope name.
    """
    # Entering the scope yields its full name including the trailing '/',
    # e.g. '<layer>/weights/summaries/'.
    with tf.name_scope('summaries') as scope:
        mean = tf.reduce_mean(var)
        tf.scalar_summary(scope + 'mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.scalar_summary(scope + 'stddev', stddev)
        tf.scalar_summary(scope + 'max', tf.reduce_max(var))
        tf.scalar_summary(scope + 'min', tf.reduce_min(var))
        tf.histogram_summary(scope + 'histogram', var)
        
def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
    """Build one fully connected layer: ``act(input_tensor * W + b)``.

    Args:
        input_tensor: tensor fed into the layer.
        input_dim: number of input features (rows of the weight matrix).
        output_dim: number of units (columns of the weight matrix).
        layer_name: name scope under which all ops of the layer are grouped.
        act: activation applied to the pre-activation (relu by default).
            Pass None to get the raw linear output.

    Returns:
        The activation tensor, or the pre-activation tensor when ``act`` is
        None.
    """
    # Adding a name scope ensures logical grouping of the layers in the graph.
    # The yielded scope name (with trailing '/') is also used to qualify the
    # histogram tags: summary tags are taken verbatim, so an unqualified
    # 'pre_activations' would be duplicated across layers and rejected when
    # the summaries are merged.
    with tf.name_scope(layer_name) as layer_scope:
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            weights = weight_variable([input_dim, output_dim])
            variable_summaries(weights)
        with tf.name_scope('biases'):
            biases = bias_variable([output_dim])
            variable_summaries(biases)
        with tf.name_scope('Wx_plus_b'):
            preactivate = tf.matmul(input_tensor, weights) + biases
            tf.histogram_summary(layer_scope + 'pre_activations', preactivate)
        if act is None:
            return preactivate
        activations = act(preactivate, name='activation')
        tf.histogram_summary(layer_scope + 'activations', activations)
        return activations

In [29]:
def get_net(x, q, action, input_size, prefix):
    """Build a three-layer Q-network together with its training ops.

    Args:
        x: placeholder holding a batch of states, ``input_size`` columns wide.
        q: placeholder holding the target Q-value for each sample.
        action: placeholder holding the index of the action taken per sample.
        input_size: number of state features.
        prefix: string prepended to every name scope of this network.

    Returns:
        dict with the keys
          'model'     - Q-values for all actions; depends only on ``x``,
          'q_pred'    - Q-value of the action given in ``action``,
          'optimizer' - Adam training op minimising 'loss',
          'loss'      - summed squared error between ``q`` and 'q_pred',
          'summary'   - merged summary op of this network only.
    """
    summary_key = tf.GraphKeys.SUMMARIES
    # Remember how many summaries exist already so that only the ones created
    # for this network are merged below.
    n_existing_summaries = len(tf.get_collection(summary_key))

    l1 = nn_layer(x, input_size, N1, prefix + 'L1')
    print(l1)
    l2 = nn_layer(l1, N1, N2, prefix + 'L2')
    print(l2)
    l3 = nn_layer(l2, N2, NUM_ACTIONS, prefix + 'L3', act=None)
    print(l3)

    one_hot_action = tf.one_hot(action, depth=NUM_ACTIONS, name=prefix + 'OneHotAction')
    print(one_hot_action)

    # Model: pick out the Q-value of the action that was actually taken.
    with tf.name_scope(prefix + 'Model'):
        q_pred = tf.reduce_sum(l3 * one_hot_action, 1, keep_dims=True, name='PredictedQVal')
        print(q_pred)

    # Loss
    with tf.name_scope(prefix + 'Loss'):
        q_diff = tf.sub(q, q_pred, name='QDiff')
        print(q_diff)
        loss = tf.reduce_sum(tf.square(q_diff, name='SquaredLoss'), name='BatchLoss')
        print(loss)

    # Optimizer
    with tf.name_scope(prefix + 'Optimizer'):
        optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

    # Summaries
    with tf.name_scope(prefix + 'Summaries'):
        # The tag is used verbatim, so it is qualified with the prefix to keep
        # the losses of different networks apart.
        tf.scalar_summary(prefix + '/loss', loss)
        # Merge only the summaries created above. tf.merge_all_summaries()
        # would also pull in the summaries of any network built earlier, and
        # evaluating those requires feeding that other network's placeholders.
        own_summaries = tf.get_collection(summary_key)[n_existing_summaries:]
        summary = tf.merge_summary(own_summaries)

    net = {
        # Callers feed only the state and take argmax / max over the result,
        # so 'model' has to be the full Q-vector rather than q_pred (which
        # additionally needs the action placeholder).
        'model': l3,
        'q_pred': q_pred,
        'optimizer': optimizer,
        'loss': loss,
        'summary': summary
    }
    return net

In [30]:
# Mostly adapted from learning.py
def get_random_action():
    """Draw a uniformly random action index from [0, NUM_ACTIONS)."""
    action_index = np.random.randint(NUM_ACTIONS)
    return action_index
    
def get_epsilon_greedy_action(state, optimizer, sess, red_team):
    """Pick an action epsilon-greedily.

    With probability EPSILON a random action is returned; otherwise the
    tensor passed as ``optimizer`` is evaluated for ``state`` and the index of
    its largest value is returned.

    Args:
        state: value fed to the state placeholder.
        optimizer: tensor to evaluate. Despite its name, callers pass the
            network's Q-value output here, not a training op.
        sess: session used for the evaluation.
        red_team: True feeds ``cat_x``, False feeds ``car_x``.
    """
    explore = random.random() < EPSILON
    if explore:
        return get_random_action()

    # Exploit: evaluate the network on the matching placeholder.
    state_placeholder = cat_x if red_team else car_x
    q_values = sess.run(optimizer, feed_dict={state_placeholder: state})
    return np.argmax(q_values)
        
def get_action(t, state, model, red_team):
    """Choose the action for frame ``t``.

    Args:
        t: current frame number.
        state: current state of the agent.
        model: tensor evaluated to obtain the Q-values.
        red_team: True for the cat agent, False for the car.

    Returns:
        None when the cat is asked to act while the red team is disabled, a
        random action during the first ``observe`` frames, and an
        epsilon-greedy action afterwards.
    """
    if red_team and not use_red_team:
        return None
    if t < observe:
        return get_random_action()
    # get_epsilon_greedy_action takes the session as its third argument; the
    # notebook-level ``sess`` is used, as in the other helpers of this cell.
    return get_epsilon_greedy_action(state, model, sess, red_team)

def train_batch(t, replay, net, red_team):
    """Run one training step on a random minibatch once observation is over.

    Args:
        t: current frame number; nothing happens while ``t <= observe``.
        replay: list of (state, action, reward, new_state) tuples. Trimmed in
            place to at most ``buffer_size`` entries.
        net: dict returned by get_net ('model', 'optimizer', 'summary', ...).
        red_team: True trains through the cat placeholders, False through the
            car placeholders.
    """
    # Still observing: only collect experience.
    if t <= observe:
        return

    # If we've stored too much in our buffer, drop the oldest entries.
    while len(replay) > buffer_size:
        replay.pop(0)

    # ``net`` is a plain dict, so its entries are looked up by key.
    X_train, y_train, train_actions = get_minibatch(replay, net['model'], red_team)

    if red_team:
        feed = {cat_x: X_train, cat_q: y_train, cat_action: train_actions}
    else:
        feed = {car_x: X_train, car_q: y_train, car_action: train_actions}

    # Train the model on this batch.
    s, _ = sess.run([net['summary'], net['optimizer']], feed_dict=feed)

    if t % SUMMARY_STEP == 0:
        # Record the frame number as the step of this summary.
        summary_writer.add_summary(s, t)
            
def get_minibatch(replay, model, red_team):
    """Sample a random minibatch from ``replay`` and turn it into training data.

    At most ``batch_size`` records are drawn; when the buffer holds fewer
    records, all of them are used (``random.sample`` raises ValueError if
    asked for more items than are available).

    Returns:
        Whatever process_minibatch returns for the sampled records.
    """
    sample_size = min(batch_size, len(replay))
    # Randomly sample our experience replay memory
    minibatch = random.sample(replay, sample_size)
    # Get training values.
    return process_minibatch(minibatch, model, red_team)

def process_minibatch(minibatch, model, red_team):
    """Convert replay records into arrays that can be fed to the network.

    Args:
        minibatch: iterable of (old_state, action, reward, new_state) tuples.
        model: tensor yielding one row of Q-values per input state.
        red_team: True for the cat network, False for the car network.

    Returns:
        (X_train, y_train, actions) as NumPy arrays: the old states with
        shape (batch, sensors), the Q-learning targets with shape (batch, 1)
        and the actions taken with shape (batch,).
    """
    if red_team:
        state_placeholder, n_sensors = cat_x, CAT_SENSORS
    else:
        state_placeholder, n_sensors = car_x, CAR_SENSORS

    records = list(minibatch)
    old_states = np.array(
        [np.asarray(old_state).reshape(n_sensors) for old_state, _, _, _ in records]
    ).reshape(-1, n_sensors)
    new_states = np.array(
        [np.asarray(new_state).reshape(n_sensors) for _, _, _, new_state in records]
    ).reshape(-1, n_sensors)

    # Evaluate all successor states in a single run instead of one run per
    # record, then take the best Q-value of each row.
    new_qvals = np.asarray(sess.run(model, feed_dict={state_placeholder: new_states}))
    max_qvals = np.max(new_qvals, axis=1)

    updates = [get_update(reward, max_qval, red_team)
               for (_, _, reward, _), max_qval in zip(records, max_qvals)]

    # feed_dict values must be plain arrays: tf.pack would return Tensors,
    # which cannot be fed, and would add new ops to the graph on every call.
    # The target placeholders have shape [None, 1], hence the column vector.
    X_train = old_states
    y_train = np.asarray(updates, dtype=np.float32).reshape(-1, 1)
    y_train_actions = np.asarray([action for _, action, _, _ in records], dtype=np.int32)
    return X_train, y_train, y_train_actions

def train_model(car_net, cat_net, sess):
    """Play ``train_frames`` frames of the game, training both agents.

    Args:
        car_net: dict returned by get_net for the car.
        cat_net: dict returned by get_net for the cat; only used when
            ``use_red_team`` is set.
        sess: TensorFlow session. Not used directly here: the helpers called
            below read the notebook-level ``sess``, so this must be invoked
            while that session is open.

    Returns:
        List of [frame, distance] pairs, one per car crash.

    Side effects:
        Decays the global EPSILON and prints a status line on each crash.
    """
    global EPSILON
    # Just stuff used below.
    max_car_distance, car_distance, t = 0, 0, 0
    data_collect, car_replay, cat_replay = [], [], []

    # The networks are plain dicts, so their entries are looked up by key.
    car_model = car_net['model']
    cat_model = cat_net['model'] if use_red_team else None

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    # NOTE(review): this call unpacks three values while the call in the loop
    # unpacks four in a different order - confirm against frame_step.
    _, car_state, cat_state = game_state.frame_step(2)

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1

        car_move = get_action(t, car_state, car_model, False)
        cat_move = get_action(t, cat_state, cat_model, True)

        # Take action, observe new state and get our treat.
        new_car_state, car_reward, new_cat_state, cat_reward = game_state.frame_step(
            car_move, cat_move)

        # Experience replay storage.
        car_replay.append((car_state, car_move, car_reward, new_car_state))
        # Train minibatch
        train_batch(t, car_replay, car_net, False)
        # Update the starting state with S'.
        car_state = new_car_state

        if use_red_team:
            cat_replay.append((cat_state, cat_move, cat_reward, new_cat_state))
            train_batch(t, cat_replay, cat_net, True)
            cat_state = new_cat_state

        # TODO: Do we want separate epsilons?
        # Decrement epsilon over time. The float literal matters: with two
        # ints, Python 2 division would yield 0 and epsilon would never decay.
        if EPSILON > 0.1 and t > observe:
            EPSILON -= 1.0 / train_frames

        # We died, so update stuff.
        if car_reward == car_crash_penalty:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, EPSILON, car_distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

    return data_collect
            
def get_update(reward, max_qval, red_team=False):
    """Compute the Q-learning target for one transition.

    Args:
        reward: reward received for the transition.
        max_qval: largest predicted Q-value of the successor state.
        red_team: True for the cat agent, False for the car.

    Returns:
        ``reward`` alone when the transition ended the episode (a crash for
        either agent, or a success for the cat); otherwise
        ``reward + GAMMA * max_qval``.
    """
    if red_team:
        is_terminal_state = (reward == cat_crash_penalty or
                             reward == cat_success_reward)
    else:
        is_terminal_state = (reward == car_crash_penalty)

    if is_terminal_state:
        # Nothing follows a terminal state, so there is no future value to add.
        return reward
    # Non-terminal: bootstrap from the discounted best next-state value.
    return reward + (GAMMA * max_qval)
            
def get_experiment_dir():
    """Return the log directory for the current settings, creating it if needed.

    The directory lives under LOGS_DIR and its name is built from
    ``use_red_team`` and ``batch_size``, so runs with different settings do
    not share a directory.
    """
    experiment_str = 'redTeam:{0}_batchSize:{1}'.format(use_red_team, batch_size)
    # Join the formatted name itself, not the literal text 'experiment_str'.
    directory = os.path.join(LOGS_DIR, experiment_str)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory

In [31]:
# Start from an empty graph so that re-running this cell does not pile up ops.
tf.reset_default_graph()

# Input state
car_x = tf.placeholder(tf.float32, shape=[None, CAR_SENSORS], name='CarX')
# Reward + GAMMA * QMAX[next_state]
car_q = tf.placeholder(tf.float32, shape=[None, 1], name='CarQ')
# Action taken in input state
car_action = tf.placeholder(tf.int32, shape=[None], name='CarAction')
car_net = get_net(car_x, car_q, car_action, CAR_SENSORS, 'CarNet')

# Defined unconditionally so the train_model call below does not raise a
# NameError when the red team is disabled.
cat_net = None
if use_red_team:
    # Input state
    cat_x = tf.placeholder(tf.float32, shape=[None, CAT_SENSORS], name='CatX')
    # Reward + GAMMA * QMAX[next_state]
    cat_q = tf.placeholder(tf.float32, shape=[None, 1], name='CatQ')
    # Action taken in input state
    cat_action = tf.placeholder(tf.int32, shape=[None], name='CatAction')
    cat_net = get_net(cat_x, cat_q, cat_action, CAT_SENSORS, 'CatNet')

# Op to write logs to Tensorboard
summary_writer = tf.train.SummaryWriter(get_experiment_dir(), graph=tf.get_default_graph())

with tf.Session() as sess:
    # Initialize all variables.
    # NOTE : This should be done after defining everything.
    init = tf.initialize_all_variables()
    sess.run(init)

    try:
        # Train Model
        train_model(car_net, cat_net, sess)
    finally:
        # Flush and close the writer even if training fails, so that whatever
        # was logged reaches the event file.
        summary_writer.flush()
        summary_writer.close()

Tensor("CarNetL1/activation:0", shape=(?, 164), dtype=float32)
Tensor("CarNetL2/activation:0", shape=(?, 150), dtype=float32)
Tensor("CarNetL3/Wx_plus_b/add:0", shape=(?, 3), dtype=float32)
Tensor("CarNetOneHotAction:0", shape=(?, 3), dtype=float32)
Tensor("CarNetModel/PredictedQVal:0", shape=(?, 1), dtype=float32)
Tensor("CarNetLoss/QDiff:0", shape=(?, 1), dtype=float32)
Tensor("CarNetLoss/BatchLoss:0", shape=(), dtype=float32)
Tensor("CatNetL1/activation:0", shape=(?, 164), dtype=float32)
Tensor("CatNetL2/activation:0", shape=(?, 150), dtype=float32)
Tensor("CatNetL3/Wx_plus_b/add:0", shape=(?, 3), dtype=float32)
Tensor("CatNetOneHotAction:0", shape=(?, 3), dtype=float32)
Tensor("CatNetModel/PredictedQVal:0", shape=(?, 1), dtype=float32)
Tensor("CatNetLoss/QDiff:0", shape=(?, 1), dtype=float32)
Tensor("CatNetLoss/BatchLoss:0", shape=(), dtype=float32)


NameError: global name 'carmunk' is not defined

In [32]:
# show_graph converts a Graph via as_graph_def() itself.
show_graph(tf.get_default_graph())