In [155]:
import os.path
import numpy as np
import random
import timeit
import tensorflow as tf
# from flat_game import carmunk

In [156]:
GAMMA = 0.9
NUM_SENSORS, NUM_ACTIONS = 3, 3
N1, N2 = 164, 150
LEARNING_RATE = 1e-3
SUMMARY_STEP = 1000

observe = 1000  # Number of frames to observe before training.
epsilon = 1
train_frames = 10000  # Number of frames to play.
batch_size = 100
buffer_size = 50000

In [157]:
# Taken from: https://goo.gl/SrpGW0
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [158]:
# Taken from TensorFlow MNIST Tutorial
# https://goo.gl/gkZs36

# We can't initialize these variables to 0 - the network will get stuck.
def weight_variable(shape):
    """Create a weight variable with appropriate initialization."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    """Create a bias variable with appropriate initialization."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.scalar_summary('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.scalar_summary('stddev', stddev)
        tf.scalar_summary('max', tf.reduce_max(var))
        tf.scalar_summary('min', tf.reduce_min(var))
        tf.histogram_summary('histogram', var)
        
def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
    """Reusable code for making a simple neural net layer.
    It does a matrix multiply, bias add, and then uses relu to nonlinearize.
    It also sets up name scoping so that the resultant graph is easy to read,
    and adds a number of summary ops.
    """
    # Adding a name scope ensures logical grouping of the layers in the graph.
    with tf.name_scope(layer_name):
        # This Variable will hold the state of the weights for the layer
        with tf.name_scope('weights'):
            weights = weight_variable([input_dim, output_dim])
            variable_summaries(weights)
        with tf.name_scope('biases'):
            biases = bias_variable([output_dim])
            variable_summaries(biases)
        with tf.name_scope('Wx_plus_b'):
            preactivate = tf.matmul(input_tensor, weights) + biases
            tf.histogram_summary('pre_activations', preactivate)
        if act is not None:
            activations = act(preactivate, name='activation')
            tf.histogram_summary('activations', activations)
            return activations
        return preactivate

In [159]:
def get_linear_model():
    # Input state
    x = tf.placeholder(tf.float32, shape=[None, NUM_SENSORS], name='X')
    
    # Neural network
    l1 = nn_layer(x, NUM_SENSORS, N1, 'L1')
    print l1
    l2 = nn_layer(l1, N1, N2, 'L2')
    print l2
    l3 = nn_layer(l2, N2, NUM_ACTIONS, 'L3', act=None)
    print l3
    
    # Reward + GAMMA * QMAX[next_state]
    q = tf.placeholder(tf.float32, shape=[None, 1], name='X')
    # Action taken in input state 
    action = tf.placeholder(tf.int32, shape=[None], name='Action')
    one_hot_action = tf.one_hot(action, depth=NUM_ACTIONS, name='OneHotAction')
    print one_hot_action
    
    # Model
    with tf.name_scope('Model'):
        q_pred = tf.reduce_sum(l3 * one_hot_action, 1, keep_dims=True, name='PredictedQVal')
        print q_pred

    # Loss
    with tf.name_scope('Loss'):
        q_diff = tf.sub(q, q_pred, name='QDiff')
        print q_diff
        loss = tf.reduce_sum(tf.square(q_diff, name='SquaredLoss'), name='BatchLoss')
        print loss
        
    # Optimizer
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)
    
    # Summaries
    with tf.name_scope('Summaries'):
        # Create a summary to monitor cost tensor
        tf.scalar_summary("loss", loss)
        # Merge all summaries into a single op
        summary = tf.merge_all_summaries()
    
    return q_pred, optimizer, loss, summary

In [160]:
def get_random_action():
    return np.random.randint(0, NUM_ACTIONS)  # random
    
def get_epsilon_greedy_action(epsilon, state, optimizer, sess):
    # Choose an action.
    if random.random() < epsilon:
        return get_random_action()
    else:
        # Get Q values for each action.
        qval = sess.run(optimizer, feed_dict={x: state})
        action = (np.argmax(qval))  # best
        
def get_minibatch(replay, model):
    # Randomly sample our experience replay memory
    minibatch = random.sample(replay, batch_size)
    # Get training values.
    return process_minibatch(minibatch, model)
        
def train_batch(X_train, y_train, train_actions, optimizer, sess):
    # Train the model on this batch.
    sess.run(optimizer, feed_dict={x: X_train, q: y_train, actions: train_actions})
    
def log_summary(X_train, y_train, train_actions, summary, sess):
    # Log summery for this batch
    s = sess.run(summary, feed_dict={x: X_train, q: y_train, actions: train_actions})
    summary_writer.add_summary(s)

# Mostly adopted from learning.py
def train_model(model, optimizer, summary, sess):
    # Just stuff used below.
    max_car_distance, car_distance, t = 0, 0, 0
    data_collect, relay = [], []

    # Create a new game instance.
    game_state = carmunk.GameState()

    # Get initial state by doing nothing and getting the state.
    _, car_state, cat_state = game_state.frame_step((2))

    # Let's time it.
    start_time = timeit.default_timer()

    # Run the frames.
    while t < train_frames:
        t += 1
        car_distance += 1
    
        if t < observe:
            action = get_random_action
        else:
            action = get_epsilon_greedy_action(epsilon, car_state, model)

        # Take action, observe new state and get our treat.
        car_reward, _, new_car_state, _ = game_state.frame_step(action)

        # Experience replay storage.
        replay.append((car_state, action, car_reward, new_car_state))

        # If we're done observing, start training.
        if t > observe:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer_size:
                replay.pop(0)
            X_train, y_train, train_actions = get_minibatch(replay, model)
            train_batch(X_train, y_train, train_actions, optimizer, sess)
            if t % SUMMARY_STEP == 0:
                log_summary(X_train, y_train, train_actions, summary, sess)
            
        # Update the starting state with S'.
        car_state = new_car_state

        # Decrement epsilon over time.
        if epsilon > 0.1 and t > observe:
            epsilon -= (1/train_frames)

        # We died, so update stuff.
        if car_reward == -500:
            # Log the car's distance at this T.
            data_collect.append([t, car_distance])

            # Update max.
            if car_distance > max_car_distance:
                max_car_distance = car_distance

            # Time it.
            tot_time = timeit.default_timer() - start_time
            fps = car_distance / tot_time

            # Output some stuff so we can watch.
            print("Max: %d at %d\tepsilon %f\t(%d)\t%f fps" %
                  (max_car_distance, t, epsilon, car_distance, fps))

            # Reset.
            car_distance = 0
            start_time = timeit.default_timer()

In [161]:
def process_minibatch(minibatch, model):
    X_train, y_train = [], []
    # Loop through our batch and create arrays for X and y
    # so that we can fit our model at every step.
    for record in minibatch:
        # Get stored values.
        old_state, action, reward, new_state = record
        # Get prediction on new state.
        new_qval = sess.run(model, feed_dict={x: new_state})
        # Get best move for new state.
        max_qval = np.max(new_qval)
        y = np.zeros((1, NUM_ACTIONS))
        # Check for terminal state.
        if reward != -500:  # non-terminal state
            update = (reward + (GAMMA * max_qval))
        else:  # terminal state
            update = reward
        X_train.append(old_state.reshape(NUM_SENSORS,))
        y_train.append(np.array(update))
        y_train_actions.append(np.array(action))

    X_train = tf.pack(X_train)
    y_train = tf.pack(y_train)
    return X_train, y_train, y_train_actions

In [162]:
tf.reset_default_graph()
model, optimizer, loss, summary  = get_linear_model()
show_graph(tf.get_default_graph().as_graph_def())

Tensor("L1/activation:0", shape=(?, 164), dtype=float32)
Tensor("L2/activation:0", shape=(?, 150), dtype=float32)
Tensor("L3/Wx_plus_b/add:0", shape=(?, 3), dtype=float32)
Tensor("OneHotAction:0", shape=(?, 3), dtype=float32)
Tensor("Model/PredictedQVal:0", shape=(?, 1), dtype=float32)
Tensor("Loss/QDiff:0", shape=(?, 1), dtype=float32)
Tensor("Loss/BatchLoss:0", shape=(), dtype=float32)


In [164]:
with tf.Session() as sess:
    # Build model
    tf.reset_default_graph()
    optimizer, loss, summary  = get_linear_model()
        
    # Op to write logs to Tensorboard
    summary_writer = tf.train.SummaryWriter(LOGS_DIR, graph=tf.get_default_graph())
    
    # Initialize all variables.
    # NOTE : This should be done after defining everything.
    init = tf.initialize_all_variables()
    sess.run(init)
        
    # Train Model
    train_model(model, optimizer, summary, sess)
    
#     summary_writer.flush()
#     summary_writer.close()

Tensor("L1/activation:0", shape=(?, 164), dtype=float32)
Tensor("L2/activation:0", shape=(?, 150), dtype=float32)
Tensor("L3/Wx_plus_b/add:0", shape=(?, 3), dtype=float32)
Tensor("OneHotAction:0", shape=(?, 3), dtype=float32)
Tensor("Model/PredictedQVal:0", shape=(?, 1), dtype=float32)
Tensor("Loss/QDiff:0", shape=(?, 1), dtype=float32)
Tensor("Loss/BatchLoss:0", shape=(), dtype=float32)


IndexError: list index out of range