In [0]:
# Start a non-visible virtual display and point the DISPLAY environment
# variable at it, so later cells in this runtime can find an X display.
import os

from pyvirtualdisplay import Display

display = Display(size=(1024, 768), visible=0)
display.start()

# DISPLAY has the form ":<display>.<screen>", taken from the started display object.
os.environ["DISPLAY"] = ":{}.{}".format(display.display, display.screen)

# DQN on GridWorld
**Important!!** Before running the following cell, make sure rllab is set up properly in your **current** runtime by executing the code in **day1_rllab_0_setup.ipynb**.

Also, the first time it is run, the code will exit without training, having only created the personal profile. If this happens, simply run the cell again.

In [5]:
from rllab.envs.grid_world_env import GridWorldEnv
from dqn.envs.proxy_gym_env import ProxyGymEnv
from dqn.misc.retro_wrappers import wrap_deepmind_retro
from dqn.policies.categorical_mlp_q_policy import CategoricalMlpQPolicy
from dqn.exploration_strategies.eps_greedy_strategy import EpsilonGreedyStrategy

from dqn.algos.dqn import DQN

from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

import lasagne.nonlinearities as NL


def run_task(*_):
    """Train a DQN agent on the 'chain' GridWorld layout.

    Any positional arguments are accepted and ignored. The function builds the
    environment, the Q-policy and an epsilon-greedy exploration strategy, hands
    them to ``DQN`` and calls ``train()``. Nothing is returned.
    """
    total_steps = 40000

    chain_env = GridWorldEnv(desc='chain')

    # hidden_sizes is an empty list, i.e. no hidden layer sizes are requested
    # (presumably this yields a linear Q-function -- confirm against
    # CategoricalMlpQPolicy).
    q_policy = CategoricalMlpQPolicy(
        name='dqn_policy',
        env_spec=chain_env.spec,
        hidden_sizes=[],
    )

    # The decay period is half of the total step budget.
    explorer = EpsilonGreedyStrategy(
        env_spec=chain_env.spec,
        max_eps=0.5,
        min_eps=0.05,
        decay_period=total_steps // 2,
    )

    learner = DQN(
        env=chain_env,
        policy=q_policy,
        es=explorer,
        n_steps=total_steps,
        # Replay pool settings.
        min_pool_size=50,
        replay_pool_size=100,
        train_epoch_interval=1000,
        max_path_length=50,
        # Optimisation settings.
        policy_update_method='sgd',
        policy_learning_rate=0.2,
        target_model_update=0.5,
        n_eval_samples=0,
        batch_size=10,
        # To enable plotting, uncomment the line below together with the
        # plot parameter of the run_experiment_lite call.
        # plot=True,
    )
    learner.train()

# run_task()
  
   
# Launch run_task through rllab's experiment runner, logging to ./gridworld_dqn.
run_experiment_lite(
    run_task,
    log_dir='./gridworld_dqn',
    n_parallel=1,          # number of parallel workers for sampling
    snapshot_mode='last',  # keep snapshot parameters for the last iteration only
    seed=1,                # experiment seed; a random seed is used when omitted
    # plot=True,           # uncomment together with plot=True inside run_task
)



python /content/scripts/run_experiment_lite.py  --n_parallel '1'  --snapshot_mode 'last'  --seed '1'  --exp_name 'experiment_2019_01_07_11_17_57_0002'  --log_dir './gridworld_dqn'  --use_cloudpickle 'True'  --args_data 'gASVJAQAAAAAAACMF2Nsb3VkcGlja2xlLmNsb3VkcGlja2xllIwOX2ZpbGxfZnVuY3Rpb26Uk5QoaACMD19tYWtlX3NrZWxfZnVuY5STlGgAjA1fYnVpbHRpbl90eXBllJOUjAhDb2RlVHlwZZSFlFKUKEsASwBLBksPS0dDYnQAZAFkAo0BfQF0AWQDfAFqAmcAZASNA30CZAV9A3QDfAFqAmQGZAd8A2QIGgBkCY0EfQR0BHwBfAJ8BHwDZApkC2QMZApkDWQOZAZkD2QQZBGNDX0FfAVqBYMAAQBkAFMAlChOjAVjaGFpbpSMBGRlc2OUhZSMCmRxbl9wb2xpY3mUjARuYW1llIwIZW52X3NwZWOUjAxoaWRkZW5fc2l6ZXOUh5RNQJxHP+AAAAAAAABHP6mZmZmZmZpLAihoEIwHbWF4X2Vwc5SMB21pbl9lcHOUjAxkZWNheV9wZXJpb2SUdJRLMktkTegDjANzZ2SURz/JmZmZmZmaSwBLCiiMA2VudpSMBnBvbGljeZSMAmVzlIwHbl9zdGVwc5SMDW1pbl9wb29sX3NpemWUjBByZXBsYXlfcG9vbF9zaXpllIwUdHJhaW5fZXBvY2hfaW50ZXJ2YWyUjA9tYXhfcGF0aF9sZW5ndGiUjBRwb2xpY3lfdXBkYXRlX21ldGhvZJSMFHBvbGljeV9sZWFybmluZ19yYXRllIwTdGFyZ2V0X21vZGVsX3VwZGF0ZZSMDm5fZXZhbF9zYW1wbGVzlIwKYmF0Y2hfc2l6

# DQN on CartPole (OpenAI Gym version)

**Important!!** Before running the following cell, make sure rllab is set up properly in your **current** runtime by executing the code in **day1_rllab_0_setup.ipynb**.

Also, the first time it is run, the code will exit without training, having only created the personal profile. If this happens, simply run the cell again.

In [3]:
from rllab.envs.gym_env import GymEnv
from dqn.policies.categorical_mlp_q_policy import CategoricalMlpQPolicy
from dqn.exploration_strategies.eps_greedy_strategy import EpsilonGreedyStrategy

from dqn.algos.dqn import DQN

from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

import lasagne.nonlinearities as NL


def run_task(*_):
    """Train a DQN agent on the Gym 'CartPole-v0' environment.

    Any positional arguments are accepted and ignored. The function builds the
    environment (video recording disabled), the Q-policy and an epsilon-greedy
    exploration strategy, hands them to ``DQN`` and calls ``train()``.
    Nothing is returned.
    """
    total_steps = 80000

    cartpole_env = GymEnv('CartPole-v0', record_video=False)

    # A single hidden layer size of 64 is requested, with the rectifier
    # (ReLU) from lasagne as the hidden nonlinearity.
    q_policy = CategoricalMlpQPolicy(
        name='dqn_policy',
        env_spec=cartpole_env.spec,
        hidden_sizes=[64],
        hidden_nonlinearity=NL.rectify,
    )

    # The decay period is a quarter of the total step budget.
    explorer = EpsilonGreedyStrategy(
        env_spec=cartpole_env.spec,
        max_eps=0.5,
        min_eps=0.05,
        decay_period=total_steps // 4,
    )

    learner = DQN(
        env=cartpole_env,
        policy=q_policy,
        es=explorer,
        n_steps=total_steps,
        # Replay pool settings.
        min_pool_size=100,
        replay_pool_size=200,
        train_epoch_interval=1000,
        max_path_length=200,
        # Optimisation settings.
        policy_update_method='sgd',
        policy_learning_rate=0.0005,
        target_model_update=0.5,
        n_eval_samples=0,
        batch_size=20,
        # To enable plotting, uncomment the line below together with the
        # plot parameter of the run_experiment_lite call.
        # plot=True,
    )
    learner.train()

# run_task()
 
  
# Launch run_task through rllab's experiment runner, logging to ./cartpole_dqn.
run_experiment_lite(
    run_task,
    log_dir='./cartpole_dqn',
    n_parallel=1,          # number of parallel workers for sampling
    snapshot_mode='last',  # keep snapshot parameters for the last iteration only
    seed=1,                # experiment seed; a random seed is used when omitted
    # plot=True,           # uncomment together with plot=True inside run_task
)

python /content/scripts/run_experiment_lite.py  --n_parallel '1'  --snapshot_mode 'last'  --seed '1'  --exp_name 'experiment_2019_01_07_11_17_57_0001'  --log_dir './cartpole_dqn'  --use_cloudpickle 'True'  --args_data 'gASViQQAAAAAAACMF2Nsb3VkcGlja2xlLmNsb3VkcGlja2xllIwOX2ZpbGxfZnVuY3Rpb26Uk5QoaACMD19tYWtlX3NrZWxfZnVuY5STlGgAjA1fYnVpbHRpbl90eXBllJOUjAhDb2RlVHlwZZSFlFKUKEsASwBLBksPS0dDanQAZAFkAmQDjQJ9AXQBZAR8AWoCZAVnAXQDagRkBo0EfQJkB30DdAV8AWoCZAhkCXwDZAoaAGQLjQR9BHQGfAF8AnwEfANkDGQNZA5kDWQPZBBkCGQRZBJkE40NfQV8BWoHgwABAGQAUwCUKE6MC0NhcnRQb2xlLXYwlImMDHJlY29yZF92aWRlb5SFlIwKZHFuX3BvbGljeZRLQCiMBG5hbWWUjAhlbnZfc3BlY5SMDGhpZGRlbl9zaXplc5SME2hpZGRlbl9ub25saW5lYXJpdHmUdJRKgDgBAEc/4AAAAAAAAEc/qZmZmZmZmksEKGgQjAdtYXhfZXBzlIwHbWluX2Vwc5SMDGRlY2F5X3BlcmlvZJR0lEtkS8hN6AOMA3NnZJRHP0BiTdLxqfxLAEsUKIwDZW52lIwGcG9saWN5lIwCZXOUjAduX3N0ZXBzlIwNbWluX3Bvb2xfc2l6ZZSMEHJlcGxheV9wb29sX3NpemWUjBR0cmFpbl9lcG9jaF9pbnRlcnZhbJSMD21heF9wYXRoX2xlbmd0aJSMFHBvbGljeV91cGRhdGVfbWV0aG9klIwUcG9saWN5X2xlYXJuaW5nX3JhdGWUj

# DQN on Breakout



In [0]:
from dqn.envs.proxy_gym_env import ProxyGymEnv
from dqn.misc.retro_wrappers import wrap_deepmind_retro
from dqn.policies.categorical_conv_q_policy import CategoricalConvQPolicy
from dqn.exploration_strategies.eps_greedy_strategy import EpsilonGreedyStrategy

from dqn.algos.dqn import DQN

from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import stub, run_experiment_lite
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction

def run_task(*_):
    """Train a DQN agent with a convolutional Q-policy on Breakout.

    Any positional arguments are accepted and ignored. The function creates
    the 'BreakoutDeterministic-v4' Gym environment, wraps it with
    ``wrap_deepmind_retro`` and ``ProxyGymEnv`` (video and log recording
    disabled), builds the policy and an epsilon-greedy exploration strategy,
    hands them to ``DQN`` and calls ``train()``. Nothing is returned.
    """
    # `gym` is not imported anywhere else in this notebook, so import it here.
    # A function-scope import also resolves when the experiment runner
    # executes this function in a separate process.
    import gym

    game = wrap_deepmind_retro(gym.envs.make('BreakoutDeterministic-v4'))

    env = ProxyGymEnv(game, record_video=False, record_log=False)

    policy = CategoricalConvQPolicy(
        name='dqn_policy',
        env_spec=env.spec,
        # Three convolutional layers (32, 64 and 64 filters; filter sizes
        # 8, 4 and 3; strides 4, 2 and 1; 'valid' padding), followed by one
        # hidden layer size of 512.
        conv_filters=[32, 64, 64],
        conv_filter_sizes=[8, 4, 3],
        conv_strides=[4, 2, 1],
        conv_pads=['valid', 'valid', 'valid'],
        hidden_sizes=[512]
    )

    n_steps = 10000000
    # The decay period is half of the total step budget.
    es = EpsilonGreedyStrategy(env_spec=env.spec, max_eps=0.5, min_eps=0.05, decay_period=n_steps//2)

    algo = DQN(
        env=env,
        policy=policy,
        es=es,
        n_steps=n_steps,
        min_pool_size=1000,
        replay_pool_size=50000,
        train_epoch_interval=10000,
        # max_path_length is left unset (it was commented out as np.max), so
        # whatever default DQN defines applies.
        policy_update_method='sgd',
        policy_learning_rate=0.005,  # needs to be lower...
        target_model_update=0.5,
        n_eval_samples=0,
        batch_size=32,
        # To enable plotting, uncomment the line below together with the
        # plot parameter of the run_experiment_lite call.
        # plot=True,
    )
    algo.train()

# run_task()

 
# Launch run_task through rllab's experiment runner, logging to ./breakout_dqn.
run_experiment_lite(
    run_task,
    log_dir='./breakout_dqn',
    n_parallel=1,          # number of parallel workers for sampling
    snapshot_mode='last',  # keep snapshot parameters for the last iteration only
    seed=1,                # experiment seed; a random seed is used when omitted
    # plot=True,           # uncomment together with plot=True inside run_task
)

