<a href="https://colab.research.google.com/github/jiedali/colab_temp/blob/main/breakout_run8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import os
import time
import numpy as np
%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)
import gym
import random
from collections import deque
import matplotlib.pyplot as plt
# choose a GPU card
# os.environ['CUDA_VISIBLE_DEVICES']="0"
# Set seed for tensorflow
# SEED=123
# tf.set_random_seed(SEED)
# GYM_SEED=678

1.15.2


In [9]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Thu Nov 19 21:26:32 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    23W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
# Mount Google Drive at /content/drive for this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
def preprocess_observation(image, dtype=np.float64):
    """Turn a raw 210x160x3 game frame into a binary 80x80x1 array.

    Rows 35..194 are kept, every second row and column of colour channel 0 is
    sampled, and the result is binarised: pixels equal to 0, 144 or 109 (the
    two background values) become 0, every other pixel becomes 1.

    Args:
        image: frame of shape (210, 160, 3). It is only read, never modified,
            so the caller's array stays intact.
        dtype: dtype of the returned array. The default, float64, is what the
            removed ``np.float`` alias used to mean. Passing a smaller dtype
            such as ``np.uint8`` shrinks every stored frame accordingly.

    Returns:
        Array of shape (80, 80, 1) containing only zeros and ones.
    """
    # Crop and downsample in one slice; this is a read-only view of the input.
    frame = np.asarray(image)[35:195:2, ::2, 0]
    # Build the mask as a new array instead of assigning into the view, which
    # would write through to the caller's frame.
    is_foreground = (frame != 0) & (frame != 144) & (frame != 109)
    return is_foreground.astype(dtype).reshape(80, 80, 1)

In [17]:
# Exploration schedule read by epsilon_greedy: epsilon falls linearly from
# eps_max to eps_min over eps_decay_steps steps and then stays at eps_min.
eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 2000000
# Number of discrete actions a random exploration move is drawn from.
n_outputs = 4

# Bounded replay buffer: once maxlen is reached, appending drops the oldest entry.
replay_memory_size = 500000
replay_memory = deque([], maxlen=replay_memory_size)


def sample_memories(batch_size):
    """Draw a random batch of distinct transitions from ``replay_memory``.

    Each stored transition is a 5-tuple
    ``(state, action, reward, next_state, continue_flag)``.

    Only ``batch_size`` indices are drawn (without replacement) instead of
    shuffling every index of the buffer on each call. The draw uses the
    standard-library ``random`` module, so seed ``random`` if reproducible
    batches are needed.

    Args:
        batch_size: number of transitions wanted. If the buffer holds fewer,
            all stored transitions are returned.

    Returns:
        Tuple ``(states, actions, rewards, next_states, continues)`` of numpy
        arrays; ``rewards`` and ``continues`` are reshaped to column vectors.
    """
    n_stored = len(replay_memory)
    n_samples = min(batch_size, n_stored)
    indices = random.sample(range(n_stored), n_samples)
    batch = [replay_memory[idx] for idx in indices]

    if batch:
        # Transpose the list of transitions into one array per field.
        cols = [np.array(col) for col in zip(*batch)]
    else:
        # Empty buffer: keep returning five empty arrays.
        cols = [np.array([]) for _ in range(5)]

    states, actions, rewards, next_states, continues = cols
    return states, actions, rewards.reshape(-1, 1), next_states, continues.reshape(-1, 1)


def epsilon_greedy(q_values, step):
    """Choose an action with a linearly decaying epsilon-greedy rule.

    Epsilon goes from ``eps_max`` down to ``eps_min`` over ``eps_decay_steps``
    steps and is clamped at ``eps_min`` afterwards. With probability epsilon a
    uniformly random action in ``[0, n_outputs)`` is returned, otherwise the
    index of the largest entry of ``q_values``.
    """
    decayed = eps_max - (eps_max-eps_min) * step/eps_decay_steps
    epsilon = max(eps_min, decayed)
    explore = np.random.rand() < epsilon
    if not explore:
        return np.argmax(q_values)
    return np.random.randint(n_outputs)

In [25]:
class DQN:
    """Builder for the convolutional Q-network used for both online and target copies.

    The constructor only stores the architecture description; the TensorFlow
    graph is created by ``create_model``, which can be called several times
    with different scope names to obtain independent copies of the network.
    """

    def __init__(self):
        """Store the layer sizes, activations and initializer of the network."""
        self.input_height = 80
        self.input_width = 80
        self.input_channels = 1
        # Three convolutional layers, described by parallel per-layer lists.
        self.conv_n_maps = [32, 64, 64]
        self.conv_kernel_sizes = [(8, 8), (4, 4), (3, 3)]
        self.conv_strides = [4, 2, 1]
        self.conv_paddings = ["SAME"] * 3
        self.conv_activation = [tf.nn.relu] * 3
        # With SAME padding the 80x80 input shrinks only by the strides
        # (80 -> 20 -> 10 -> 10), so the last conv layer outputs 64 maps of 10x10.
        self.n_hidden_in = 64 * 10 * 10
        self.n_hidden = 512
        self.hidden_activation = tf.nn.relu
        self.n_outputs = 4
        # One variance-scaling initializer shared by every layer.
        self.initializer = tf.contrib.layers.variance_scaling_initializer()

    def _zipped_params(self):
        """Iterate over (n_maps, kernel_size, strides, padding, activation) per conv layer."""
        return zip(self.conv_n_maps, self.conv_kernel_sizes,
                   self.conv_strides, self.conv_paddings, self.conv_activation)

    def create_model(self, state, name):
        """Build one copy of the network under variable scope ``name``.

        Args:
            state: input tensor fed to the first convolutional layer.
            name: variable scope name; distinct names give independent weights.

        Returns:
            Tuple ``(outputs, trainable_vars_by_name)``. ``outputs`` has
            ``n_outputs`` units per input (one Q-value per action, no
            activation). ``trainable_vars_by_name`` maps each trainable
            variable's name, with the scope prefix removed, to the variable.
        """
        # NOTE(review): preprocess_observation in this notebook produces 0/1
        # values, so this scaling maps the input to [0, 1/128] -- confirm intended.
        prev_layer = state / 128.0
        with tf.variable_scope(name) as scope:
            for n_maps, kernel_size, strides, padding, activation in self._zipped_params():
                prev_layer = tf.layers.conv2d(prev_layer, filters=n_maps, kernel_size=kernel_size,
                                              strides=strides, padding=padding, activation=activation,
                                              kernel_initializer=self.initializer)

            # Flatten the conv output so it can feed the dense layers.
            last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, self.n_hidden_in])
            hidden = tf.layers.dense(last_conv_layer_flat, self.n_hidden, activation=self.hidden_activation,
                                     kernel_initializer=self.initializer)
            outputs = tf.layers.dense(hidden, self.n_outputs, kernel_initializer=self.initializer)

        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        # Strip the scope prefix so two copies built under different scopes
        # share the same keys and can be matched variable by variable.
        trainable_vars_by_name = {var.name[len(scope.name):]: var for var in trainable_vars}

        return outputs, trainable_vars_by_name

In [None]:
# Start from an empty default graph so re-running this cell does not stack
# a second copy of the networks onto the previous one.
tf.reset_default_graph()
# Shape of one preprocessed frame fed to the networks.
input_height = 80
input_width = 80
input_channels = 1
# Number of discrete actions (width of the one-hot action mask in the loss).
n_outputs = 4

# Settings of the momentum optimizer.
learning_rate = 0.001
momentum = 0.95

# Stop once the global training step reaches this value.
n_steps = 4000000
# Game iterations to play before the first training step.
training_start = 10000
# Run one training step every this many game iterations.
training_interval = 4
# Save the checkpoint and the mean-max-Q log every this many training steps.
save_steps = 1000
# changed copy_steps from 10000 to 2500
# Copy the online network into the target network every this many training steps.
copy_steps = 2500
# Discount factor applied to the target network's best next Q-value.
discount_rate = 0.99
# Number of no-op (action 0) steps played after every environment reset.
skip_start = 90
# Number of transitions sampled from the replay memory per training step.
batch_size = 50
checkpoint_path = "./breakout_run8.ckpt"


def train_model():
    """Train the online DQN on Breakout until ``n_steps`` training steps are done.

    Builds an online and a target copy of the network on one shared input
    placeholder, restores the checkpoint if one exists (otherwise initialises
    the variables), and then loops: play one environment step with the online
    network, record the transition, and, once ``training_start`` iterations
    have passed, run one optimisation step every ``training_interval``
    iterations.

    Side effects: prints a progress line on every iteration; every
    ``save_steps`` training steps writes the checkpoint to ``checkpoint_path``
    and the per-episode mean max-Q list to ``breakout_run8_mean_max_q.txt``.
    The environment is closed when the function exits, including on error or
    interruption.
    """
    iteration = 0
    loss_val = np.inf
    # Running totals for the current episode; reset whenever an episode ends.
    game_length = 0
    total_max_q = 0.0
    mean_max_q = 0.0
    done = True  # makes the first iteration reset the environment
    state = []
    final_mean_max_q = []

    dqn = DQN()
    env = gym.make("Breakout-v0")

    try:
        X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width, input_channels])

        online_q_values, online_vars = dqn.create_model(X_state, "qnetwork_online")
        target_q_values, target_vars = dqn.create_model(X_state, "qnetwork_target")

        # One assign op per variable, matched by scope-relative name.
        copy_ops = [target_var.assign(online_vars[var_name])
                    for var_name, target_var in target_vars.items()]
        copy_online_to_target = tf.group(*copy_ops)

        X_action, global_step, loss, training_op, y = define_train_variables(online_q_values)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as sess:

            restore_session(copy_online_to_target, init, saver, sess)

            while True:
                step = global_step.eval()
                if step >= n_steps:
                    break

                iteration += 1
                print("\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f}   ".format(
                    iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="")

                state = skip_some_steps(done, env, state)

                done, q_values, next_state = evaluate_and_play_online_dqn(X_state, env, online_q_values, state, step)
                state = next_state

                # compute_statistics only returns the mean; its numeric
                # arguments are passed by value, so the running totals have to
                # be advanced here or every episode would look one step long.
                mean_max_q = compute_statistics(done, game_length, mean_max_q, q_values, total_max_q)
                total_max_q += q_values.max()
                game_length += 1

                if done:
                    # Episode finished: record its mean and start fresh totals.
                    final_mean_max_q.append(mean_max_q)
                    total_max_q = 0.0
                    game_length = 0

                if iteration < training_start or iteration % training_interval != 0:
                    continue

                loss_val = train_online_dqn(X_action, X_state, loss, sess, target_q_values, training_op, y)

                # Copy the online DQN to the target DQN
                if step % copy_steps == 0:
                    copy_online_to_target.run()

                # Save the model and the per-episode mean max-Q values together
                if step % save_steps == 0:
                    saver.save(sess, checkpoint_path)
                    with open('breakout_run8_mean_max_q.txt', 'w') as file:
                        file.write('%s\n' % final_mean_max_q)
    finally:
        # Release the emulator even if training is interrupted or fails.
        env.close()




def define_train_variables(online_q_values):
    """Create the loss and the training op for the online network.

    All ops are created under the ``"train"`` variable scope. The loss is
    quadratic for absolute errors up to 1 and linear beyond that.

    Args:
        online_q_values: Q-value output tensor of the online network.

    Returns:
        Tuple ``(X_action, global_step, loss, training_op, y)``: the action
        placeholder, the step counter variable, the scalar loss, the
        optimizer's minimise op and the target-value placeholder.
    """
    with tf.variable_scope("train"):
        X_action = tf.placeholder(tf.int32, shape=[None])
        y = tf.placeholder(tf.float32, shape=[None, 1])

        # Keep only the Q-value of the action that was actually taken.
        action_mask = tf.one_hot(X_action, n_outputs)
        q_value = tf.reduce_sum(online_q_values * action_mask, axis=1, keepdims=True)

        abs_error = tf.abs(y - q_value)
        quadratic_part = tf.clip_by_value(abs_error, 0.0, 1.0)
        linear_part = 2 * (abs_error - quadratic_part)
        loss = tf.reduce_mean(tf.square(quadratic_part) + linear_part)

        global_step = tf.Variable(0, trainable=False, name='global_step')
        training_op = tf.train.MomentumOptimizer(
            learning_rate, momentum, use_nesterov=True
        ).minimize(loss, global_step=global_step)
    return X_action, global_step, loss, training_op, y


def restore_session(copy_online_to_target, init, saver, sess):
    """Restore the checkpoint if present, otherwise initialise from scratch.

    A checkpoint counts as present when ``checkpoint_path + ".index"`` is an
    existing file. Without one, the variables are initialised and the online
    network is copied into the target network.
    """
    has_checkpoint = os.path.isfile(checkpoint_path + ".index")
    if not has_checkpoint:
        init.run()
        copy_online_to_target.run()
        print("created a new session")
        return
    saver.restore(sess, checkpoint_path)
    print("restored session")


def skip_some_steps(done, env, state):
    """Start a new episode when the previous one has ended.

    If ``done`` is false, ``state`` is returned unchanged. Otherwise the
    environment is reset, action 0 is played ``skip_start`` times, and the
    preprocessed last observation is returned as the new state.
    """
    if not done:
        return state
    obs = env.reset()
    for _ in range(skip_start):
        obs, _reward, _done, _info = env.step(0)
    return preprocess_observation(obs)


def evaluate_and_play_online_dqn(X_state, env, online_q_values, state, step):
    """Play one environment step chosen by the online network.

    Evaluates the online Q-values for ``state``, picks an action with
    ``epsilon_greedy``, steps the environment, and appends the transition
    ``(state, action, reward, next_state, 1.0 - done)`` to ``replay_memory``.

    Returns:
        Tuple ``(done, q_values, next_state)``.
    """
    # Decide: Q-values for a batch holding just the current state.
    q_values = online_q_values.eval(feed_dict={X_state: [state]})
    chosen_action = epsilon_greedy(q_values, step)

    # Act, then preprocess what the environment shows next.
    obs, reward, done, info = env.step(chosen_action)
    next_state = preprocess_observation(obs)

    # Remember: the last field is 0.0 when the episode ended, 1.0 otherwise.
    continue_flag = 1.0 - done
    transition = (state, chosen_action, reward, next_state, continue_flag)
    replay_memory.append(transition)

    return done, q_values, next_state


def compute_statistics(done, game_length, mean_max_q, q_values, total_max_q):
    """Return the mean max-Q of the episode, refreshed only when it has ended.

    ``game_length`` and ``total_max_q`` are the totals *before* this step; the
    current step's largest Q-value is added here. Only the mean is returned,
    so the updated totals are not visible to the caller.

    Returns:
        ``(total_max_q + q_values.max()) / (game_length + 1)`` when ``done``
        is true, otherwise the ``mean_max_q`` that was passed in.
    """
    q_sum = total_max_q + q_values.max()
    steps_played = game_length + 1
    return q_sum / steps_played if done else mean_max_q


def train_online_dqn(X_action, X_state, loss, sess, target_q_values, training_op, y):
    """Run one optimisation step of the online network on a sampled batch.

    The training targets are
    ``reward + continue_flag * discount_rate * max(target Q of next state)``,
    where the next-state Q-values come from the target network.

    Returns:
        The loss value reported by the session for this batch.
    """
    states, actions, rewards, next_states, continues = sample_memories(batch_size)

    # The target network scores the next states; keep the best action's value.
    target_q = target_q_values.eval(feed_dict={X_state: next_states})
    best_next_q = np.max(target_q, axis=1, keepdims=True)
    y_val = rewards + continues * discount_rate * best_next_q

    # One gradient step on the online network.
    feed = {X_state: states, X_action: actions, y: y_val}
    _, loss_val = sess.run([training_op, loss], feed_dict=feed)
    return loss_val


# Entry point: start training when this code runs as the main module
# (which is the case when the cell is executed in a notebook kernel).
if __name__ == '__main__':
    train_model()

created a new session
Iteration 113220	Training step 25805/4000000 (0.6)%	Loss 0.000188	Mean Max-Q 0.095707   