<a href="https://colab.research.google.com/github/kldfznmsg/colaboratory_backup/blob/main/A3C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 安装运行环境

### 挂载Google云盘

In [None]:
# Mount Google Drive at /content/gdrive; the later cells read ROMs and write
# summaries under /content/gdrive/MyDrive/prj.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### 基础环境安装

In [None]:
# Select the TensorFlow version (Colab magic); the code below uses the TF 1.x API
%tensorflow_version 1.x

# Install gym together with its Atari extras (output is discarded)
!pip install gym[atari] > /dev/null 2>&1 

# Install the system/python packages needed for off-screen rendering
!apt-get install x11-utils > /dev/null 2>&1 
!pip install pyglet > /dev/null 2>&1 
!apt-get install -y xvfb python-opengl > /dev/null 2>&1

# Install the virtual display wrapper, then register the ROMs stored on Drive
# (the ROM folder is populated by the one-time download cell of this notebook)
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!python -m atari_py.import_roms /content/gdrive/MyDrive/prj/data/games

### 下载Atari游戏包（仅第一次的时候运行）

In [None]:
# Install the Atari emulator bindings
!pip install atari-py

# Download the ROM archive to Drive, extract it, and register the ROMs with atari_py
!wget -P /content/gdrive/MyDrive/prj/data http://www.atarimania.com/roms/Roms.rar
!unrar e /content/gdrive/MyDrive/prj/data/Roms.rar /content/gdrive/MyDrive/prj/data/games
!python -m atari_py.import_roms /content/gdrive/MyDrive/prj/data/games

# Train

## 在Jupyter里训练

### 加载Atari环境

In [None]:
import tensorflow as tf
from skimage.transform import resize
from skimage.color import rgb2gray
import numpy as np
from collections import deque

class AtariEnvironment(object):
    """
    Small wrapper for gym atari environments.

    Preprocesses raw screens (grayscale + resize) and keeps a buffer of the
    most recent ``agent_history_length - 1`` frames. Every state handed to the
    agent is that buffer plus the newest frame, stacked along axis 0, i.e. an
    array of shape ``(agent_history_length, resized_width, resized_height)``.
    """
    def __init__(self, gym_env, resized_width, resized_height, agent_history_length):
        """
        gym_env: gym environment exposing ``spec.id``, ``action_space.n``,
            ``reset()`` and ``step(action)``.
        resized_width, resized_height: target shape passed to ``resize``.
        agent_history_length: number of frames stacked into one state (>= 1).

        Raises ValueError if agent_history_length is smaller than 1.
        """
        if agent_history_length < 1:
            raise ValueError("agent_history_length must be >= 1")

        self.env = gym_env
        self.resized_width = resized_width
        self.resized_height = resized_height
        self.agent_history_length = agent_history_length

        self.gym_actions = range(gym_env.action_space.n)
        if gym_env.spec.id in ("Pong-v0", "Breakout-v0"):
            print ("Doing workaround for pong or breakout")
            # Gym returns 6 possible actions for breakout and pong.
            # Only three are used, the rest are no-ops. This just lets us
            # pick from a simplified "LEFT", "RIGHT", "NOOP" action space.
            self.gym_actions = [1,2,3]

        # Holds the previous agent_history_length-1 frames. maxlen makes the
        # deque drop its oldest frame automatically on append and also keeps
        # the degenerate history length of 1 (empty buffer) working.
        self.state_buffer = deque(maxlen=agent_history_length - 1)

    def get_initial_state(self):
        """
        Resets the atari game and refills the state buffer.

        Returns the initial state: the first preprocessed frame repeated
        agent_history_length times along axis 0.
        """
        x_t = self.env.reset()
        x_t = self.get_preprocessed_frame(x_t)

        # Start from a fresh buffer filled with copies of the first frame.
        self.state_buffer = deque([x_t] * (self.agent_history_length - 1),
                                  maxlen=self.agent_history_length - 1)

        # Stack exactly agent_history_length frames (previously hard-coded to 4).
        return np.stack([x_t] * self.agent_history_length, axis=0)

    def get_preprocessed_frame(self, observation):
        """
        See Methods->Preprocessing in Mnih et al.
        1) Get image grayscale
        2) Rescale image
        """
        return resize(rgb2gray(observation), (self.resized_width, self.resized_height))

    def step(self, action_index):
        """
        Executes an action in the gym environment.

        action_index indexes into self.gym_actions. The new state is the
        buffered previous frames followed by the freshly observed frame; the
        new frame then replaces the oldest one in the buffer.

        Returns (state, reward, terminal, info) where the last three values
        are passed through unchanged from the gym environment.
        """
        x_t1, r_t, terminal, info = self.env.step(self.gym_actions[action_index])
        x_t1 = self.get_preprocessed_frame(x_t1)

        # Stacking the frames themselves keeps the state layout identical to
        # get_initial_state() even when width != height.
        s_t1 = np.stack(list(self.state_buffer) + [x_t1], axis=0)

        # Appending evicts the oldest frame because the deque is bounded.
        self.state_buffer.append(x_t1)

        return s_t1, r_t, terminal, info


### 加载A3C模型

In [None]:
import tensorflow as tf
from keras import backend as K
from keras.layers import Convolution2D, Flatten, Dense, Input
from keras.models import Model

def build_policy_and_value_networks(num_actions, agent_history_length, resized_width, resized_height):
    """
    Build the actor (policy) and critic (value) heads on a shared conv trunk.

    All ops are placed on the CPU device. Two Keras models are created that
    share the conv1/conv2/h1 layers and differ only in their output layer,
    and both are applied to a freshly created state placeholder.

    Returns (state, p_out, v_out, p_params, v_params):
        state    -- float placeholder of shape
                    [None, agent_history_length, resized_width, resized_height]
        p_out    -- softmax output over num_actions for `state`
        v_out    -- linear single-unit value output for `state`
        p_params -- trainable weights of the policy model
        v_params -- trainable weights of the value model

    NOTE(review): frames are stacked on axis 1; whether the convolutions treat
    that axis as channels depends on the Keras image data format, which is not
    set here -- confirm it matches the intended (channels-first) layout.
    """
    with tf.device("/cpu:0"):
        state = tf.placeholder("float", [None, agent_history_length, resized_width, resized_height])

        # Shared trunk: input -> conv1 -> conv2 -> flatten -> dense(relu)
        inputs = Input(shape=(agent_history_length, resized_width, resized_height,))
        shared = Convolution2D(name="conv1", nb_filter=16, nb_row=8, nb_col=8, subsample=(4,4), activation='relu', border_mode='same')(inputs)
        shared = Convolution2D(name="conv2", nb_filter=32, nb_row=4, nb_col=4, subsample=(2,2), activation='relu', border_mode='same')(shared)
        shared = Flatten()(shared)
        shared = Dense(name="h1", output_dim=256, activation='relu')(shared)

        # Actor head: probability for each action
        action_probs = Dense(name="p", output_dim=num_actions, activation='softmax')(shared)

        # Critic head: scalar state value
        state_value = Dense(name="v", output_dim=1, activation='linear')(shared)

        policy_network = Model(input=inputs, output=action_probs)
        value_network = Model(input=inputs, output=state_value)

        p_params = policy_network.trainable_weights
        v_params = value_network.trainable_weights

        # Apply both models to the placeholder that callers feed states into.
        p_out = policy_network(state)
        v_out = value_network(state)
    # Marker printed once the graph for the model has been built.
    print("a3c_model")
    return state, p_out, v_out, p_params, v_params

Using TensorFlow backend.


### A3C主函数训练（a3c.py）

加载相关函数

In [None]:
#!/usr/bin/env python
from skimage.transform import resize
from skimage.color import rgb2gray
import threading
import tensorflow as tf
import sys
import random
import numpy as np
import time
import gym
from keras import backend as K
from keras.layers import Convolution2D, Flatten, Dense
from collections import deque
#from a3c_model import build_policy_and_value_networks
from keras import backend as K
#from atari_environment import AtariEnvironment
from pyvirtualdisplay import Display
from PIL import Image
from IPython import display
import matplotlib.pyplot as plt


# Path params
EXPERIMENT_NAME = "breakout_a3c"
PRJ_ROOT_DIR = "/content/gdrive/MyDrive/prj/data"
# TensorBoard event files are written here by train().
SUMMARY_SAVE_PATH = PRJ_ROOT_DIR+"/summaries/"+EXPERIMENT_NAME
# Checkpoint prefix passed to saver.save() together with global_step=T.
CHECKPOINT_SAVE_PATH = "/tmp/"+EXPERIMENT_NAME+".ckpt"
# Checkpoint restored by evaluation().
# NOTE(review): the "-5" suffix has to match the global step of a checkpoint
# that was actually saved -- confirm before evaluating.
CHECKPOINT_NAME = CHECKPOINT_SAVE_PATH+"-5"
# Checkpoint period, measured in steps of the shared counter T.
CHECKPOINT_INTERVAL=5000
# Minimum number of seconds between two summary writes in train().
SUMMARY_INTERVAL=5
# main() runs train() when TRAINING is True and evaluation() otherwise.
# TRAINING = False
TRAINING = True

# When True, train() renders every environment on each pass of its loop.
#SHOW_TRAINING = True
SHOW_TRAINING = False

# Experiment params
GAME = "Breakout-v0"
# Size of the action space used by the policy head and the one-hot action vectors.
ACTIONS = 3
# Number of actor-learner threads (one gym environment each).
NUM_CONCURRENT = 8
# Not referenced by the code visible in this notebook.
NUM_EPISODES = 20000

# Number of stacked frames per state and the size each frame is resized to.
AGENT_HISTORY_LENGTH = 4
RESIZED_WIDTH = 84
RESIZED_HEIGHT = 84

# Discount factor applied when accumulating returns.
GAMMA = 0.99

# Optimization Params
LEARNING_RATE = 0.00001

# Shared global parameters
# T is the step counter incremented by every actor-learner thread;
# the threads stop once T reaches TMAX.
T = 0
TMAX = 80000000
# Maximum number of environment steps collected before each gradient update.
t_max = 32

def start_pyvirtual_screen():
  """Create an invisible 640x480 pyvirtualdisplay `Display` and start it."""
  screen_size = (640, 480)
  Display(visible=0, size=screen_size).start()


def render_env(env,mode='virtual'):
  """
  Render the current game screen.

  mode='local'   -- call env.render() directly.
  mode='virtual' -- grab the frame as an RGB array, draw it with matplotlib
                    and show the figure through IPython's display, clearing
                    the previous output once new output arrives.
  Any other value only prints a usage message.
  """
  if mode == 'virtual':
    frame = env.render(mode='rgb_array')
    plt.imshow(frame)
    figure = plt.gcf()
    display.display(figure)
    display.clear_output(wait=True)
    return

  if mode == 'local':
    env.render()
    return

  print("Param 'mode' in Function 'render_env' need be 'local'or'virtual'")


def sample_policy_action(num_actions, probs):
    """
    Sample an action from an action probability distribution output by
    the policy network.

    num_actions: kept for interface compatibility; the number of actions is
        taken from the length of `probs`.
    probs: 1-D sequence of non-negative action probabilities.

    Returns the sampled action index as a Python int.
    Raises ValueError if `probs` does not have a positive, finite sum.
    """
    # Work in float64 and renormalise. float32 softmax outputs can sum to
    # slightly more than 1, which numpy's multinomial rejects; subtracting a
    # constant instead (the previous workaround) turns exact zeros negative,
    # which multinomial rejects as well.
    probs = np.clip(np.asarray(probs, dtype=np.float64), 0.0, None)
    total = probs.sum()
    if not np.isfinite(total) or total <= 0.0:
        raise ValueError("probs must have a positive, finite sum")
    probs = probs / total

    histogram = np.random.multinomial(1, probs)
    # Exactly one entry of the histogram is 1, so argmax is the drawn index.
    return int(np.argmax(histogram))

def actor_learner_thread(num, env, session, graph_ops, summary_ops, saver):
    """
    Body of one actor-learner thread.

    Repeatedly collects up to t_max steps from its own environment using the
    shared policy network, computes discounted returns (bootstrapped from the
    value network unless the episode ended), and applies one update through
    the shared `minimize` op. Runs until the global step counter T reaches
    TMAX.

    num: thread index; also used to stagger start-up (sleeps 5*num seconds).
    env: raw gym environment owned by this thread.
    session: TensorFlow session shared by all threads.
    graph_ops: tuple (s, a, R, minimize, p_network, v_network) from build_graph().
    summary_ops: tuple from setup_summaries(); only the episode-reward
        placeholder and assign op (first two items) are used here.
    saver: tf.train.Saver used for periodic checkpoints.
    """
    # We use global shared counter T, and TMAX constant.
    # NOTE(review): T is incremented from several threads without a lock, so
    # individual increments can be lost; it is only used as a coarse counter.
    global TMAX, T

    # Unpack graph ops
    s, a, R, minimize, p_network, v_network = graph_ops

    # Unpack the tensorboard summary ops this thread updates
    r_summary_placeholder, update_ep_reward = summary_ops[0], summary_ops[1]

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=env, resized_width=RESIZED_WIDTH, resized_height=RESIZED_HEIGHT, agent_history_length=AGENT_HISTORY_LENGTH)

    # Stagger the threads so they do not all start in lock-step
    time.sleep(5*num)

    # Set up per-episode counters
    ep_reward = 0
    probs_summary_t = 0

    # Index of the checkpoint interval T currently lies in. A checkpoint is
    # written whenever T moves into a later interval; testing
    # `T % CHECKPOINT_INTERVAL == 0` almost never fires because T advances by
    # many steps between two checks.
    last_checkpoint_bucket = T // CHECKPOINT_INTERVAL

    s_t = env.get_initial_state()
    terminal = False

    while T < TMAX:
        s_batch = []
        past_rewards = []
        a_batch = []

        # Number of steps collected in the current rollout
        t = 0

        while not (terminal or t == t_max):
            # Perform action a_t according to policy pi(a_t | s_t)
            probs = session.run(p_network, feed_dict={s: [s_t]})[0]
            action_index = sample_policy_action(ACTIONS, probs)
            a_t = np.zeros([ACTIONS])
            a_t[action_index] = 1

            # Periodic progress line: max action probability and state value
            if probs_summary_t % 100 == 0:
                print("THREAD:", num,"P, ", np.max(probs), "V ", session.run(v_network, feed_dict={s: [s_t]})[0][0])

            s_batch.append(s_t)
            a_batch.append(a_t)

            s_t1, r_t, terminal, info = env.step(action_index)
            ep_reward += r_t

            # Rewards are clipped to [-1, 1] for learning; ep_reward keeps the raw sum
            past_rewards.append(np.clip(r_t, -1, 1))

            t += 1
            T += 1
            probs_summary_t += 1

            s_t = s_t1

        if terminal:
            R_t = 0
        else:
            R_t = session.run(v_network, feed_dict={s: [s_t]})[0][0] # Bootstrap from last state

        # Discounted returns, accumulated backwards from the last step
        R_batch = np.zeros(t)
        for i in reversed(range(t)):
            R_t = past_rewards[i] + GAMMA * R_t
            R_batch[i] = R_t

        session.run(minimize, feed_dict={R : R_batch,
                                         a : a_batch,
                                         s : s_batch})

        # Save progress roughly every CHECKPOINT_INTERVAL global steps. Only
        # thread 0 writes, so concurrent threads never save to the same path.
        checkpoint_bucket = T // CHECKPOINT_INTERVAL
        if checkpoint_bucket != last_checkpoint_bucket:
            last_checkpoint_bucket = checkpoint_bucket
            if num == 0:
                saver.save(session, CHECKPOINT_SAVE_PATH, global_step = T)

        if terminal:
            # Episode ended, collect stats and reset game
            session.run(update_ep_reward, feed_dict={r_summary_placeholder: ep_reward})
            print("THREAD:", num, "/ TIME", T, "/ REWARD", ep_reward)
            s_t = env.get_initial_state()
            terminal = False
            # Reset per-episode counters
            ep_reward = 0

def build_graph():
    """
    Build the shared networks and the A3C training op.

    Returns (s, a_t, R_t, minimize, p_network, v_network):
        s         -- state placeholder created by build_policy_and_value_networks
        a_t       -- float placeholder [None, ACTIONS] for one-hot actions
        R_t       -- float placeholder [None] for discounted returns
        minimize  -- op applying one Adam step on the combined loss
        p_network -- policy output tensor
        v_network -- value output tensor
    """
    # Create shared global policy and value networks
    s, p_network, v_network, _p_params, _v_params = build_policy_and_value_networks(num_actions=ACTIONS, agent_history_length=AGENT_HISTORY_LENGTH, resized_width=RESIZED_WIDTH, resized_height=RESIZED_HEIGHT)

    # Shared global optimizer
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)

    # Inputs for the update: discounted returns and one-hot actions
    R_t = tf.placeholder("float", [None])
    a_t = tf.placeholder("float", [None, ACTIONS])

    # Flatten the value output to [batch] before subtracting it from the
    # [batch] returns; a trailing unit dimension would otherwise broadcast the
    # difference to [batch, batch].
    v_flat = tf.reshape(v_network, [-1])
    advantage = R_t - v_flat

    # Probability of the action that was taken; the small constant keeps the
    # log finite if that probability underflows to zero.
    action_prob = tf.reduce_sum(p_network * a_t, axis=1)
    log_prob = tf.log(action_prob + 1e-10)

    # The advantage is a constant for the policy term, so the policy gradient
    # must not flow back into the value head.
    p_loss = -tf.reduce_mean(log_prob * tf.stop_gradient(advantage))
    v_loss = tf.reduce_mean(tf.square(advantage))

    # Scalar combined loss
    total_loss = p_loss + (0.5 * v_loss)

    minimize = optimizer.minimize(total_loss)
    return s, a_t, R_t, minimize, p_network, v_network

# Episode-level summary ops for visualisation in tensorboard.
def setup_summaries():
    """
    Create scalar summaries for the episode reward and the episode value.

    For each of the two quantities a variable (initialised to 0.), a scalar
    summary on it, a float placeholder and an assign op writing the
    placeholder into the variable are created.

    Returns (r_summary_placeholder, update_ep_reward,
             val_summary_placeholder, update_ep_val, summary_op),
    where summary_op merges all summaries.
    """
    def _tracked_scalar(tag):
        # Same op creation order as before: variable, summary, placeholder, assign.
        tracked = tf.Variable(0.)
        tf.summary.scalar(tag, tracked)
        feed = tf.placeholder("float")
        return feed, tracked.assign(feed)

    r_summary_placeholder, update_ep_reward = _tracked_scalar("Episode Reward")
    val_summary_placeholder, update_ep_val = _tracked_scalar("Episode Value")
    merged = tf.summary.merge_all()
    return (r_summary_placeholder, update_ep_reward,
            val_summary_placeholder, update_ep_val, merged)

def train(session, graph_ops, saver):
    """
    Run A3C training.

    Creates NUM_CONCURRENT environments, initialises all variables, starts
    one actor-learner thread per environment and, while any of them is still
    running, writes the merged summaries every SUMMARY_INTERVAL seconds
    (optionally rendering the environments when SHOW_TRAINING is set).

    session: TensorFlow session holding the graph built by build_graph().
    graph_ops: tuple returned by build_graph().
    saver: tf.train.Saver handed to the worker threads for checkpoints.
    """
    # Set up game environments (one per thread)
    envs = [gym.make(GAME) for _ in range(NUM_CONCURRENT)]

    summary_ops = setup_summaries()
    summary_op = summary_ops[-1]

    # Initialize variables
    session.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_SAVE_PATH, session.graph)

    # Start NUM_CONCURRENT training threads
    actor_learner_threads = [threading.Thread(target=actor_learner_thread, args=(thread_id, envs[thread_id], session, graph_ops, summary_ops, saver)) for thread_id in range(NUM_CONCURRENT)]
    for worker in actor_learner_threads:
        worker.start()

    # Show the agents training and write summary statistics
    last_summary_time = 0
    try:
        # Stop supervising once every worker has finished, instead of looping
        # forever (which also made the join below unreachable).
        while any(worker.is_alive() for worker in actor_learner_threads):
            if SHOW_TRAINING:
                for env in envs:
                    render_env(env)
            now = time.time()
            if now - last_summary_time > SUMMARY_INTERVAL:
                summary_str = session.run(summary_op)
                writer.add_summary(summary_str, T)
                last_summary_time = now
            # Yield to the worker threads rather than busy-spinning here.
            time.sleep(0.05)
        for worker in actor_learner_threads:
            worker.join()
    finally:
        # Flush pending events even if the loop is interrupted.
        writer.close()

def evaluation(session, graph_ops, saver):
    """
    Restore CHECKPOINT_NAME and play 100 episodes with the learned policy.

    The environment is wrapped in gym's Monitor wrapper, which records to
    /tmp/<EXPERIMENT_NAME>/eval, and the total (unclipped) reward of each
    episode is printed.

    session: TensorFlow session holding the graph built by build_graph().
    graph_ops: tuple returned by build_graph().
    saver: tf.train.Saver used to restore the weights.
    """
    # Local import: `env.monitor.start()/close()` no longer exists in gym;
    # recording is done through the Monitor wrapper instead.
    from gym import wrappers

    saver.restore(session, CHECKPOINT_NAME)
    print("Restored model weights from ", CHECKPOINT_NAME)
    # force=True lets the cell be re-run without failing on earlier recordings.
    monitor_env = wrappers.Monitor(gym.make(GAME), '/tmp/'+EXPERIMENT_NAME+"/eval", force=True)

    # Unpack graph ops
    s, a_t, R_t, minimize, p_network, v_network = graph_ops

    # Wrap env with AtariEnvironment helper class
    env = AtariEnvironment(gym_env=monitor_env, resized_width=RESIZED_WIDTH, resized_height=RESIZED_HEIGHT, agent_history_length=AGENT_HISTORY_LENGTH)

    try:
        for i_episode in range(100):
            s_t = env.get_initial_state()
            ep_reward = 0
            terminal = False
            while not terminal:
                monitor_env.render()
                # Run the policy network to get pi(a | s_t) and sample from it
                probs = p_network.eval(session = session, feed_dict = {s : [s_t]})[0]
                action_index = sample_policy_action(ACTIONS, probs)
                s_t1, r_t, terminal, info = env.step(action_index)
                s_t = s_t1
                ep_reward += r_t
            print(ep_reward)
    finally:
        # Always finalise the recording, even if an episode raises.
        monitor_env.close()


训练

In [None]:
def main(_):
  """
  Entry point invoked by tf.app.run(); the argument is ignored.

  Starts the virtual display, builds the graph inside a fresh tf.Graph and
  session, then runs train() or evaluation() depending on TRAINING.
  """
  # Colab has no GUI: start a virtual display so frames can be rendered
  start_pyvirtual_screen()

  # Fresh graph that only holds the network definition
  graph = tf.Graph()
  with graph.as_default():
    # Session that executes the ops defined in the graph
    with tf.Session() as session:
      K.set_session(session)
      graph_ops = build_graph()
      saver = tf.train.Saver()

      run_mode = train if TRAINING else evaluation
      run_mode(session, graph_ops, saver)

# Script/cell entry point: hand control to TensorFlow's app runner.
if __name__ == "__main__":
  tf.app.run()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


  if sys.path[0] == '':
W0411 19:23:36.427436 139742985955200 deprecation.py:506] From /tensorflow-1.15.2/python3.7/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
  del sys.path[0]
  from ipykernel import kernelapp as app


a3c_model
INFO:tensorflow:Summary name Episode Reward is illegal; using Episode_Reward instead.


I0411 19:23:38.301537 139742985955200 summary_op_util.py:66] Summary name Episode Reward is illegal; using Episode_Reward instead.


INFO:tensorflow:Summary name Episode Value is illegal; using Episode_Value instead.


I0411 19:23:38.312760 139742985955200 summary_op_util.py:66] Summary name Episode Value is illegal; using Episode_Value instead.


Doing workaround for pong or breakout
Doing workaround for pong or breakout
Doing workaround for pong or breakout
Doing workaround for pong or breakout
Doing workaround for pong or breakout
Doing workaround for pong or breakout
Doing workaround for pong or breakout
Doing workaround for pong or breakout
THREAD: 0 P,  0.34499517 V  -0.025435545
THREAD: 0 P,  0.3477712 V  -0.009897483
THREAD: 0 P,  0.34853387 V  0.0051855575
THREAD: 1 P,  0.34774157 V  0.0018332974
THREAD: 0 P,  0.34800065 V  0.023341568
THREAD: 1 P,  0.3505816 V  0.026907165
THREAD: 0 / TIME 493 / REWARD 3.0
THREAD: 0 P,  0.35295513 V  0.042321336
THREAD: 2 P,  0.35265425 V  0.04531278
THREAD: 1 P,  0.35465932 V  0.045340646
THREAD: 1 / TIME 779 / REWARD 1.0
THREAD: 0 P,  0.3547118 V  0.07079646
THREAD: 2 P,  0.35475692 V  0.0723781
THREAD: 3 P,  0.35494345 V  0.07469487
THREAD: 0 / TIME 949 / REWARD 0.0
THREAD: 1 P,  0.35669133 V  0.07839729
THREAD: 2 / TIME 1168 / REWARD 0.0
THREAD: 2 P,  0.35875428 V  0.11202829
THREA

## 直接在工程里训练

In [None]:
# Train the A3C model from the project script.
# Drive is mounted at /content/gdrive in this notebook, so the script has to
# be addressed under that mount point.
!python /content/gdrive/MyDrive/prj/a3c.py --experiment breakout --game "Breakout-v0" --num_concurrent 8

In [None]:
# Train the asynchronous DQN model from the project script.
# Drive is mounted at /content/gdrive in this notebook, so the script has to
# be addressed under that mount point.
!python /content/gdrive/MyDrive/prj/async_dqn.py --experiment breakout --game "Breakout-v0" --num_concurrent 8