In [1]:
from rl_gans.model.model import SAC_Model
from rl_gans.algos.sac import SAC
from rl_gans.utils.argument import parse_args
from pathlib import Path

import torch
import numpy as np
import gym
import time
import os
import json
from pathlib import Path
import wandb

In [2]:
# Parse the project's default arguments, then override them for a short
# pixel-based SAC test run.
args = parse_args()
args.agent = "sac"
args.env_image_size = 84
args.agent_image_size = 84
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs top to bottom on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
args.save_tb = True

# Shortened schedule for the test run.
args.num_train_steps = 10000
args.eval_freq = 2000
args.init_steps = 1000
args.numupdates = 100
args.save_video = False
print(args)

Namespace(action_repeat=4, actor_beta=0.9, actor_log_std_max=2, actor_log_std_min=-10, actor_lr=0.001, actor_update_freq=2, agent='sac', agent_image_size=84, alpha_beta=0.5, alpha_lr=0.0001, batch_size=128, critic_beta=0.9, critic_encoder_tau=0.05, critic_lr=0.001, critic_target_update_freq=2, critic_tau=0.01, detach_encoder=False, discount=0.99, discriminator_beta=0.5, discriminator_lr=0.0001, discriminator_update_freq=2, domain_name='cheetah', encoder_feature_dim=50, env_image_size=84, eval_freq=2000, frame_stack=3, generator_beta=0.5, generator_lr=0.0001, generator_update_freq=1, hidden_dim=1024, image_pad=None, init_steps=1000, init_temperature=0.1, log_interval=25, num_eval_episodes=10, num_filters=32, num_layers=4, num_train_steps=10000, numupdates=100, replay_buffer_capacity=100000, save_buffer=True, save_model=False, save_tb=True, save_video=False, seed=1, tag='', task_name='run', work_dir='.')


In [3]:
# Open a Weights & Biases run; the parsed argument namespace is stored as the
# run's config.
wandb_init_kwargs = dict(
    project="SAC Test",
    entity="karamdaaboul",
    name=None,
    config=args,
    # sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
)
run = wandb.init(**wandb_init_kwargs)

[34m[1mwandb[0m: Currently logged in as: [33msafe-transfer-learning-in-changing-environments[0m (use `wandb login --relogin` to force relogin)


In [4]:
# Patch TensorBoard so that event files under <work_dir>/tb are mirrored to wandb.
# NOTE(review): args.work_dir still holds the parsed default at this point; a
# later cell appends the experiment name to it — confirm that the Logger's
# event files are still resolved correctly relative to this root.
tb_root = f'{args.work_dir}/tb'
wandb.tensorboard.patch(root_logdir=tb_root, pytorch=True)

In [5]:
from rl_gans.memory import ReplayBufferStorage
from rl_gans.memory.replay_buffer import make_replay_buffer
from rl_gans.utils.misc import set_seed_everywhere, make_dir, VideoRecorder, eval_mode
from rl_gans.utils.logger import Logger

In [6]:
# Build a descriptive experiment name and create the run's output directories.
ts = time.strftime("%m-%d", time.gmtime())  # month-day stamp in UTC
env_name = f"{args.domain_name}-{args.task_name}"
exp_name = (
    f"{env_name}-{ts}-im{args.env_image_size}"
    f"-b{args.batch_size}-s{args.seed}-{args.agent}"
)

# Append the experiment name only once: re-running this cell on the same
# kernel must not keep nesting work_dir (./exp/exp/...).
if Path(args.work_dir).name != exp_name:
    args.work_dir = args.work_dir + '/' + exp_name
make_dir(args.work_dir)
video_dir = make_dir(os.path.join(args.work_dir, 'video'))
model_dir = make_dir(os.path.join(args.work_dir, 'model'))

os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
os.environ['MUJOCO_GL'] = 'egl'
# dir_name is None when video saving is switched off
# (presumably this disables recording — verify in VideoRecorder).
video = VideoRecorder(dir_name=video_dir if args.save_video else None)

print(args.work_dir)

./cheetah-run-06-01-im84-b128-s1-sac


In [7]:
# Persist the complete argument namespace next to the run's outputs.
args_path = os.path.join(args.work_dir, 'args.json')
with open(args_path, 'w') as args_file:
    args_file.write(json.dumps(vars(args), sort_keys=True, indent=4))

In [8]:
# prepare env
from rl_gans.wrappers.pixel_observation_wrapper import PixelObservation

# Training environment: HalfCheetah wrapped so that it is observed as pixels.
env = gym.make("HalfCheetah-v3")
print(env._max_episode_steps)
env = PixelObservation(env, observation_size=args.env_image_size, normalize=False)

# Evaluation environment: wrap its OWN freshly created simulator. Wrapping
# `env` here instead would make evaluation share (and reset) the training
# environment and would stack the pixel wrapper twice.
eval_env = gym.make("HalfCheetah-v3")
eval_env = PixelObservation(eval_env, observation_size=args.env_image_size, normalize=False)


1000
Creating window glfw


In [9]:
# Derive the shapes handed to the model and the replay buffer.
args.env_image_size = 84
env_obs_shape = (3,) + (args.env_image_size,) * 2
agent_obs_shape = (3,) + (args.agent_image_size,) * 2

action_shape = env.action_space.shape
observation_shape = env.observation_space.shape

print(f"action_shape: {action_shape}")
print(f"observation_shape: Agent {agent_obs_shape}, Environment {env_obs_shape}")

action_shape: (6,)
observation_shape: Agent (3, 84, 84), Environment (3, 84, 84)


Define the SAC model

In [10]:
from rl_gans.model import SAC_Model

# Gather the network hyper-parameters from the parsed arguments, then build
# the SAC model for the environment's observation and action shapes.
sac_model_kwargs = dict(
    obs_shape=env_obs_shape,
    action_shape=action_shape,
    hidden_dim=args.hidden_dim,
    encoder_feature_dim=args.encoder_feature_dim,
    log_std_min=args.actor_log_std_min,
    log_std_max=args.actor_log_std_max,
    num_layers=args.num_layers,
    num_filters=args.num_filters,
    device=device,
)
sac_model = SAC_Model(**sac_model_kwargs)

Define the agent 

In [11]:
from rl_gans.algos.sac import SAC

# Build the SAC agent around the model; the whole argument namespace is
# passed through as `args`.
agent = SAC(model=sac_model,
            device=device,
            action_shape=action_shape,
            args=args)

In [12]:
def evaluate(env, agent, video, num_episodes, L, step, tag=None):
    """Roll out the agent for ``num_episodes`` episodes and return the mean return.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``; ``step``
            must return ``(obs, reward, done, info)``.
        agent: object exposing ``select_action(obs)``; it is put into
            ``eval_mode`` while an action is being selected.
        video: recorder exposing ``init(enabled=...)``, ``record(env)`` and
            ``save(file_name)``. Recording is enabled for the first episode only.
        num_episodes: number of evaluation episodes to run.
        L: logger exposing ``log(key, value, step)``, or ``None`` to skip both
            logging and video saving.
        step: training step used for the log entries and the video file name.
        tag: accepted for interface compatibility; not used.

    Returns:
        ``np.mean`` over the per-episode summed rewards.
    """
    returns = []
    for episode_idx in range(num_episodes):
        obs = env.reset()
        video.init(enabled=(episode_idx == 0))
        episode_return = 0
        while True:
            with eval_mode(agent):
                action = agent.select_action(obs)
            obs, reward, finished, _ = env.step(action)
            video.record(env)
            episode_return += reward
            if finished:
                break

        # Saving the video and logging are both tied to a logger being present.
        if L is not None:
            video.save(f'{step}.mp4')
            L.log('eval/episode_reward', episode_return, step)
        returns.append(episode_return)

    return np.mean(returns)

In [13]:
# Sanity check: show the agent object, the network layout and init_steps.
for shown in (agent, sac_model, args.init_steps):
    print(shown)

<rl_gans.algos.sac.SAC object at 0x7ff05b543250>
SAC_Model(
  (actor): Actor(
    (encoder): Encoder(
      (cnn): SharedCNN(
        (layers): Sequential(
          (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2))
          (1): ReLU()
          (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
          (3): ReLU()
          (4): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
          (5): ReLU()
          (6): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
          (7): Flatten()
        )
      )
      (projection): RLProjection(
        (projection): Sequential(
          (0): Linear(in_features=39200, out_features=50, bias=True)
          (1): LayerNorm((50,), eps=1e-05, elementwise_affine=True)
          (2): Tanh()
        )
      )
    )
    (mlp): Sequential(
      (0): Linear(in_features=50, out_features=1024, bias=True)
      (1): ReLU()
      (2): Linear(in_features=1024, out_features=1024, bias=True)
      (3): ReLU()
      (4): Linear(in_features=1024, 

In [14]:
# Explicitly leave image padding unset; this value is forwarded to
# make_replay_buffer(image_pad=...) when the replay buffer is created.
args.image_pad = None

In [15]:
# Display the experiment directory that all run outputs are written under.
args.work_dir

'./cheetah-run-06-01-im84-b128-s1-sac'

In [16]:
# run
# Storage is rooted at <work_dir>/buffer. The sampling buffer itself is created
# lazily by the training loop once init_steps has been reached.
buffer_dir = Path(args.work_dir) / 'buffer'
replay_storage = ReplayBufferStorage(buffer_dir)
replay_buffer = None

L = Logger(args.work_dir, use_tb=args.save_tb, config=args.agent)



In [18]:
# Training-loop state. Starting with done=True forces an environment reset on
# the very first iteration.
episode, episode_reward, done, info = 0, 0, True, {}
start_time = time.time()

# Number of agent updates performed per environment step once training starts.
# NOTE(review): args.numupdates is set in the config cell but is not read
# here — confirm whether it was meant to drive this value.
updates_per_step = 5
# Episode length at which `done` is treated as a time-limit cut-off rather
# than a true terminal state (matches the 1000 printed by the env cell).
max_episode_steps = 1000

for step in range(args.num_train_steps+1):
    # evaluate agent periodically
    if step > 0 and step % args.eval_freq == 0:
        print("evaluation")
        L.log('eval/episode', episode, step)
        with torch.no_grad():
            # NOTE(review): runs 3 episodes instead of args.num_eval_episodes,
            # presumably to keep the test run short.
            evaluate(eval_env, agent, video, 3, L, step)
        if args.save_model:
            agent.save_model(model_dir, step)

    if done:
        if step > 0:
            replay_storage.add(obs, None, None, True)  # add the last observation for each episode
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)
                L.log('train/duration', time.time() - start_time, step)
                L.dump(step)
            start_time = time.time()

        obs = env.reset()
        done = False
        episode_reward = 0
        episode_step = 0
        episode += 1
        print("episode", episode)
        if step % args.log_interval == 0:
            L.log('train/episode', episode, step)

    # sample action for data collection: uniformly random during the first
    # init_steps steps, from the agent afterwards
    if step < args.init_steps:
        action = env.action_space.sample()
    else:
        with eval_mode(agent):
            action = agent.sample_action(obs)

    # run training update
    if step >= args.init_steps:
        if replay_buffer is None:
            # Created on first use, once the storage already holds data.
            replay_buffer = make_replay_buffer(replay_dir=Path(args.work_dir) / 'buffer',
                                               replay_type="Normal",
                                               max_size=args.replay_buffer_capacity,
                                               batch_size=args.batch_size,
                                               num_workers=1,
                                               save_snapshot=False,
                                               nstep=1,
                                               discount=args.discount,
                                               obs_shape=env_obs_shape,
                                               device=device,
                                               image_size=args.agent_image_size,
                                               image_pad=args.image_pad)
            print(replay_buffer.sample)

        for _ in range(updates_per_step):
            agent.update(replay_buffer, L, step)

    next_obs, reward, done, info = env.step(action)

    # allow infinite bootstrap: an episode cut off by the step limit is not
    # stored as a terminal transition
    done_bool = 0 if episode_step + 1 == max_episode_steps else float(done)
    episode_reward += reward
    replay_storage.add(obs, action, reward, done_bool)

    obs = next_obs
    episode_step += 1

if run is not None:
    run.finish()

episode 1
| [33mtrain[0m | E: 1 | S: 1000 | D: 1.0 s | R: -243.7209 | BR: 0.0000 | A_LOSS: 0.0000 | CR_LOSS: 0.0000
episode 2
<bound method ReplayBuffer.sample of <rl_gans.memory.replay_buffer.ReplayBuffer object at 0x7ff09891e490>>
evaluation
| [33mtrain[0m | E: 2 | S: 2000 | D: 746.0 s | R: -147.5789 | BR: -0.2456 | A_LOSS: 0.0000 | CR_LOSS: 0.3294
| [32meval[0m | S: 2000 | ER: -153.7922
episode 3
| [33mtrain[0m | E: 3 | S: 3000 | D: 725.5 s | R: -180.6900 | BR: -0.1907 | A_LOSS: 0.0000 | CR_LOSS: 0.2758
episode 4
evaluation
| [33mtrain[0m | E: 4 | S: 4000 | D: 730.1 s | R: -134.8358 | BR: -0.1857 | A_LOSS: 0.0000 | CR_LOSS: 0.1779
| [32meval[0m | S: 4000 | ER: -223.0792
episode 5
| [33mtrain[0m | E: 5 | S: 5000 | D: 723.9 s | R: -187.0819 | BR: -0.1731 | A_LOSS: 0.0000 | CR_LOSS: 0.1911
episode 6
evaluation
| [33mtrain[0m | E: 6 | S: 6000 | D: 731.3 s | R: -247.3464 | BR: -0.1772 | A_LOSS: 0.0000 | CR_LOSS: 0.1616
| [32meval[0m | S: 6000 | ER: -19.6008
episode 7
| 

VBox(children=(Label(value='0.455 MB of 0.455 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/episode,▁▃▅▆█
eval/episode_reward,▄▁█▂▃
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/batch_reward,▃▂▅▁▁▄▆▆▆▅█▄▄▄▆▆▅▄▆█▄▅▆▄▇▆▆▄▅▆▆▄▇▄▆▃▄▄▅▅
train/duration,▁█████████
train/episode,▁▂▂▃▄▅▅▆▇▇█
train/episode_reward,▁▅▄▆▄▁█▆▄▃
train_actor/entropy,██▆▆▆▅▃▃▄▄▄▄▃▄▄▃▃▄▃▃▃▃▃▃▃▃▃▂▁▂▁▁▂▁▁▁▁▂▂▂
train_actor/loss,██▇▇▆▇▆▆▅▅▅▄▄▃▃▂▂▂▂▂▂▂▂▁▁▂▁▂▁▂▂▁▁▁▁▁▁▁▁▁
train_actor/target_entropy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/episode,10.0
eval/episode_reward,-186.19005
global_step,10000.0
train/batch_reward,-0.0902
train/duration,731.51984
train/episode,11.0
train/episode_reward,-189.74002
train_actor/entropy,2.50255
train_actor/loss,-35.97055
train_actor/target_entropy,-6.0


In [21]:
# Recap the schedule settings and the number of episodes that were started.
schedule_summary = (
    ("args.eval_freq", args.eval_freq),
    ("args.init_steps", args.init_steps),
    ("args.num_train_steps+1", args.num_train_steps + 1),
)
for label, value in schedule_summary:
    print(f"{label}: {value}")
print(episode)

args.eval_freq: 2000
args.init_steps: 1000
args.num_train_steps+1: 10001
11
