Prioritized experience replay with random agent initialization

In [1]:
%matplotlib inline

from datetime import datetime
from imp import reload
import os
import random

from matplotlib import pyplot as plt
from numpy.random import seed as rng_seed
from smooth import smooth
import tensorflow as tf

import ddqn_per_class

  from ._conv import register_converters as _register_converters
Using Theano backend.


# Pin RNG

In [40]:
# https://machinelearningmastery.com/reproducible-results-neural-networks-keras/#comment-414394

rng_seed(0)

In [2]:
def benchmark(agent_class, num_runs = 3, random_seeds = (0, 100, 1000), env_name='CartPole-v1', **kwargs):
    assert len(random_seeds) == num_runs
    runs = []
    for idx in range(num_runs):  
        np.random.seed(random_seeds[idx])
        start = datetime.now()
        agent = agent_class(env_name, **kwargs)
        print(f'{start}')
        agent.run()
        end = datetime.now()
        data = {'instance': agent, 'total_time': (end-start).total_seconds()}
        runs.append(data)
        print(f'\nRun {idx} total time: {(end-start).total_seconds()/60:.2f} mins')        
    return runs

def print_benchmark(data):
    for run in data:
        agent = run['instance']
        time = run['total_time']
        for key, val in agent.__dict__.items():
            if isinstance(val, list):
                continue
            print(f'{key}: {val}')

        print(f'\nTotal training time: {time/60:.2f} minutes')
        print('---')    
        

def plot_timeseries(data, hline_at=100):
    plt.figure(figsize=(20, 10))
    for series in data:
        plt.plot(series, alpha=0.7)
    plt.hlines(hline_at, 0, len(series), linestyles='dotted', )
    plt.legend(list(range(0, len(data))))

In [5]:
agent = ddqn_per_class.PrioritizedExperienceReplayDDQN
benchmark(agent, render_every=0, save_every=50, update_target_every=300, num_episodes=3000)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Starting to initializing memory with random agent.
Finished initializing memory with random agent.
Saved network to data/2018-05-15__19--10--01/PER/CartPole-v1_agent.params
2018-05-15 19:10:01.260419
Saved data to data/2018-05-15__19--10--01/PER/history.data
Saved data to data/2018-05-15__19--10--01/PER/history.datad: 0.0, 100-episode mean reward: 0.0 
Saved data to data/2018-05-15__19--10--01/PER/history.dataeward: 10.3, 100-episode mean reward: 11.82 
Saved data to data/2018-05-15__19--10--01/PER/history.data
Saved data to data/2018-05-15__19--10--01/PER/history.dataward: 10.3, 100-episode mean reward: 11.41 
Saved data to data/2018-05-15__19--10--01/PER/history.dataeward: 10.1, 100-episode mean reward: 10.91 
Saved data to data/2018-05-15__19--10--01/PER/history.dataeward: 10.9, 100-episode mean reward: 10.56 
Saved data to data/2018-05-15__19--10--01/PER/history.dataeward: 13

[{'instance': PrioritizedExperienceReplayDDQN(env_name='CartPole-v1', memory_size=100000, save_every=50, render_every=0, num_episodes=3000, update_target_every=300, report_every=10, max_episode_len=700, batch_size=64, discount_rate=0.99, epsilon_max=1.0, epsilon_min=0.01, annealing_const=0.001, data_directory='data', random_init_steps=1000),
  'total_time': 4890.45441},
 {'instance': PrioritizedExperienceReplayDDQN(env_name='CartPole-v1', memory_size=100000, save_every=50, render_every=0, num_episodes=3000, update_target_every=300, report_every=10, max_episode_len=700, batch_size=64, discount_rate=0.99, epsilon_max=1.0, epsilon_min=0.01, annealing_const=0.001, data_directory='data', random_init_steps=1000),
  'total_time': 4344.298366},
 {'instance': PrioritizedExperienceReplayDDQN(env_name='CartPole-v1', memory_size=100000, save_every=50, render_every=0, num_episodes=3000, update_target_every=300, report_every=10, max_episode_len=700, batch_size=64, discount_rate=0.99, epsilon_max=1.0

In [22]:
reload(ddqn_per_class)

<module 'ddqn_per_class' from '/Users/liavkoren/AI Curriculum/dqn_implementations/mine/ddqn_per_class.py'>

In [23]:
ddq_per = ddqn_per_class.PrioritizedExperienceReplayDDQN('CartPole-v1', save_every=50, update_target_every=300, num_episodes=3000)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Starting to initializing memory with random agent.
Finished initializing memory with random agent.
Saved network to data/2018-05-16__10--12--49/PER/CartPole-v1_agent.params


In [24]:
ddq_per.run()

Episode: 90, steps: 2027, reward: 34.0, 10-episode mean reward: 14.8, 100-episode mean reward: 19.93 

KeyboardInterrupt: 

In [26]:
ddq_per_bigger_memory = ddqn_per_class.PrioritizedExperienceReplayDDQN(
    'CartPole-v1', 
    save_every=50, 
    update_target_every=300, 
    num_episodes=3000, 
    memory_size=int(1e6),
    render_every=20,
)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Starting to initializing memory with random agent.
Finished initializing memory with random agent.
Saved network to data/2018-05-16__10--13--32/PER/CartPole-v1_agent.params


In [27]:
ddq_per_bigger_memory.run()

Episode: 2990, steps: 603270, reward: 500.0, 10-episode mean reward: 467.9, 100-episode mean reward: 304.98 

Larger memory seems to make the q histories more stable, but convergence also was slower. What if we increase the target network update freq back to default?

In [31]:
ddq_per_bigger_memory2 = ddqn_per_class.PrioritizedExperienceReplayDDQN(
    'CartPole-v1', 
    save_every=50, 
    memory_size=int(1e6),
    render_every=20,
    num_episodes=3000, 
)
ddq_per_bigger_memory2.run()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Starting to initializing memory with random agent.
Finished initializing memory with random agent.
Saved network to data/2018-05-16__12--22--13/PER/CartPole-v1_agent.params
Episode: 2990, steps: 1090352, reward: 216.0, 10-episode mean reward: 360.2, 100-episode mean reward: 463.43 

# Using Importance Sampling Weights

In [54]:
reload(ddqn_per_class)

<module 'ddqn_per_class' from '/Users/liavkoren/AI Curriculum/dqn_implementations/mine/ddqn_per_class.py'>

In [56]:
agent = ddqn_per_class.PrioritizedExperienceReplayDDQN
use_importance_weights = benchmark(
    agent, 
    render_every=0, 
    save_every=50, 
    use_importance_weights=True,
    memory_size=int(1e6),
    report_every=1,
)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Starting to initializing memory with random agent.
Finished initializing memory with random agent.
Saved network to data/2018-05-17__15--31--18/PER/CartPole-v1_agent.params
2018-05-17 15:31:18.100732
Episode: 999, steps: 257769, reward: 275.0, 10-episode mean reward: 254.9, 100-episode mean reward: 255.28 
Run 0 total time: 17.43 mins
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Starting to initializing memory with random agent.
Finished initializing memory with random agent.
Saved network to data/2018-05-17__15--48--44/PER/CartPole-v1_agent.params
2018-05-17 15:48:44.079839
Episode: 999, steps: 209173, reward: 224.0, 10-episode mean reward: 325.0, 100-episode mean reward: 286.57 
Run 1 total time: 14.24 mins
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
St

In [3]:
agent = ddqn_per_class.PrioritizedExperienceReplayDDQN
use_importance_weights = benchmark(
    agent, 
    render_every=0, 
    save_every=50, 
    use_importance_weights=False,
    memory_size=int(1e6),
    report_every=1,
)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Starting to initializing memory with random agent.
Finished initializing memory with random agent.
Saved network to data/2018-06-26__15--29--21/PER/CartPole-v1_agent.params
2018-06-26 15:29:18.630575
Episode: 136, steps: 6129, reward: 201.0, 10-episode mean reward: 157.2, 100-episode mean reward: 53.14 

KeyboardInterrupt: 