# Hindsight Experience Replay

This notebook is based on the paper *Hindsight Experience Replay* (Andrychowicz et al., 2017): https://arxiv.org/abs/1707.01495

In [1]:
import torch
import gym
import gym.spaces
import rocket_lander_gym
import warnings
from agent.config import Config
from agent.ddpg_agent import DDPGAgent
from agent.td3_agent import TD3Agent
from agent.utils import seed_all, plot_scores

# Silence every Python warning raised from this point on.
# NOTE(review): a blanket 'ignore' filter also hides deprecation notices from
# the imported libraries — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Environment id passed to gym.make. Ids noted by the author as options:
#   'RocketLander-v0' | 'LunarLanderContinuous-v2'
# NOTE(review): 'RocketLander-v0' presumably becomes available through the
# rocket_lander_gym import in the first cell — confirm.
env = gym.make('LunarLanderContinuous-v2')

In [3]:
# Settings object that is later handed to the agent constructor.
config = Config()

# --- Environment and reproducibility ---------------------------------------
config.env = env
config.seed = 0
# Dimensions are read from the first axis of the env's observation/action spaces.
config.state_size = env.observation_space.shape[0]
config.action_size = env.action_space.shape[0]

# --- Run length, stopping criteria, logging --------------------------------
# NOTE(review): presumably training counts as "solved" at an average score of
# env_solved over times_solved episodes — confirm in the agent code.
config.num_episodes = 3000
config.max_steps = 2000
config.max_steps_reward = None
config.env_solved = 200
config.times_solved = 100
config.log_every = 100

# --- Replay buffer and update schedule -------------------------------------
config.buffer_size = int(1e6)
config.batch_size = 64
config.update_every = 1
config.num_updates = 1
config.policy_freq_update = 1

# --- Learning hyperparameters ----------------------------------------------
config.gamma = 0.99
config.tau = 1e-3
config.lr_actor = 1e-4
config.lr_critic = 1e-3
config.optim_actor = torch.optim.Adam    # optimizer class, not an instance
config.optim_critic = torch.optim.Adam
config.grad_clip_actor = None
config.grad_clip_critic = None
config.use_huber_loss = False

# --- Network architecture --------------------------------------------------
config.hidden_actor = (128, 64)
config.hidden_critic = (128, 64)
config.activ_actor = torch.nn.ReLU()     # activation module instances
config.activ_critic = torch.nn.ReLU()

# --- Exploration noise -----------------------------------------------------
# NOTE(review): the ou_* values presumably only matter while use_ou_noise is
# True, with expl_noise used otherwise — confirm in the agent code.
config.use_ou_noise = True
config.ou_mu = 0.0
config.ou_theta = 0.15
config.ou_sigma = 0.2
config.expl_noise = 0.1
config.noise_weight = 1.0

# --- Policy-noise settings -------------------------------------------------
# NOTE(review): these look like TD3-style options; whether DDPGAgent reads
# them cannot be told from this notebook — confirm.
config.policy_noise = 0.2
config.noise_clip = 0.5

In [4]:
# Apply config.seed before the agent is constructed.
# NOTE(review): seed_all is a project helper from agent.utils; presumably it
# seeds the Python/NumPy/torch RNGs and the environment — confirm.
seed_all(config.seed, env)

In [5]:
# Build the agent from the shared config. DDPG is the active choice;
# to try TD3 instead, use: agent = TD3Agent(config)
agent = DDPGAgent(config)

In [6]:
# Print the actor and critic network layouts as a quick sanity check of the
# layer sizes configured above.
agent.summary()

DDGP Agent:

Actor Network:
--------------
Actor(
  (activ): ReLU()
  (layers): ModuleList(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=2, bias=True)
  )
)

Critic Network:
---------------
Critic(
  (activ): ReLU()
  (layers): ModuleList(
    (0): Linear(in_features=10, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
)


In [7]:
# Run training and keep the returned scores for the plotting cell below.
# NOTE: `scores` is only bound if train() returns normally. In the saved run
# this cell was stopped with a KeyboardInterrupt, so `scores` was never assigned.
scores = agent.train()

Episode 100	Avg Score: -280.18	Avg Actor Loss: 23.80	Avg Critic Loss: 104.65
Episode 200	Avg Score: -356.78	Avg Actor Loss: 50.12	Avg Critic Loss: 41.64
Episode 300	Avg Score: -365.18	Avg Actor Loss: 73.69	Avg Critic Loss: 21.52
Episode 400	Avg Score: -211.05	Avg Actor Loss: 94.26	Avg Critic Loss: 15.86
Episode 500	Avg Score: -120.89	Avg Actor Loss: 98.16	Avg Critic Loss: 13.70
Episode 600	Avg Score: -83.48	Avg Actor Loss: 85.52	Avg Critic Loss: 13.441
Episode 700	Avg Score: 11.90	Avg Actor Loss: 73.67	Avg Critic Loss: 14.456
Episode 800	Avg Score:  8.15	Avg Actor Loss: 63.68	Avg Critic Loss: 21.444
Episode 900	Avg Score: -14.59	Avg Actor Loss: 57.52	Avg Critic Loss: 24.03
Episode 1000	Avg Score: 18.82	Avg Actor Loss: 51.14	Avg Critic Loss: 30.82
Episode 1100	Avg Score: 37.74	Avg Actor Loss: 45.46	Avg Critic Loss: 27.34
Episode 1200	Avg Score: 52.95	Avg Actor Loss: 40.28	Avg Critic Loss: 30.58
Episode 1300	Avg Score: -143.25	Avg Actor Loss: 35.99	Avg Critic Loss: 35.69
Episode 1400	Avg

KeyboardInterrupt: 

In [None]:
# Plot the training scores; polyfit_deg=6 presumably overlays a degree-6
# polynomial trend line (plot_scores is a project helper — confirm).
# NOTE(review): needs `scores` from the training cell. This cell has no
# execution count, i.e. it was not run in the saved notebook.
plot_scores(scores, polyfit_deg=6)