In [1]:
import argparse
import datetime
import os
import pprint
import yaml

import gym, gym_xarm
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer, Batch
from tianshou.env import DummyVectorEnv, SubprocVectorEnv
from offpolicy import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ActorProb, Critic
from tianshou.policy import SACPolicy, BasePolicy

import gym_naive
from her_collector import HERCollector
from sac_her_policy import SACHERPolicy

'''
make env
'''
config = {
    'dim': 2,
    'reward_type': 'sparse'
}


def make_env():
    """Create one 'Incremental-v0' env wrapped so observations are flat vectors.

    Used as the factory for every worker of the vectorized envs below.
    """
    raw_env = gym.make('Incremental-v0', config=config)
    return gym.wrappers.FlattenObservation(raw_env)


# Reference env used to read shapes; also reused later for its action space
# and its compute_reward function.
env = gym.make('Incremental-v0', config=config)
# Keep the un-flattened observation space: it must be read before the
# FlattenObservation wrapper replaces it.
observation_space = env.observation_space
env = gym.wrappers.FlattenObservation(env)

# The flattened observation is a 1-D vector, so its length is the state size.
obs = env.reset()
state_shape = len(obs)

# Use the action space's shape when it is non-empty, otherwise fall back to `n`.
action_shape = env.action_space.shape
if not action_shape:
    action_shape = env.action_space.n

# Two sub-process workers for training, a single in-process env for testing.
train_envs = SubprocVectorEnv(
    [make_env for _ in range(2)],
    norm_obs=False,
)
test_envs = DummyVectorEnv([make_env])

'''
build and init network
'''
# actor: one hidden layer of 16 units feeding ActorProb
net_a = Net(state_shape, hidden_sizes=[16], device='cpu')
actor = ActorProb(
    net_a,
    action_shape,
    max_action=env.action_space.high[0],
    device='cpu',
    unbounded=True,
    conditioned_sigma=True,
)
actor = actor.to('cpu')
actor_optim = torch.optim.Adam(actor.parameters(), lr=0.001)

# critics: two identically configured networks taking state and action
# (concat=True). Both backbones are built first, then the Critic heads, so the
# construction order matches the original cell.
net_c1, net_c2 = (
    Net(state_shape, action_shape, hidden_sizes=[16], concat=True, device='cpu')
    for _ in range(2)
)

critic1 = Critic(net_c1, device='cpu')
critic1 = critic1.to('cpu')
critic1_optim = torch.optim.Adam(critic1.parameters(), lr=0.001)

critic2 = Critic(net_c2, device='cpu')
critic2 = critic2.to('cpu')
critic2_optim = torch.optim.Adam(critic2.parameters(), lr=0.001)

'''
set up policy
'''
# SAC policy with HER support (project-local SACHERPolicy). The actor, the two
# critics and their optimizers are passed positionally, as in the original call.
policy = SACHERPolicy(
    actor, actor_optim,
    critic1, critic1_optim,
    critic2, critic2_optim,
    tau=0.005,
    gamma=0.9,
    alpha=0.2,
    estimation_step=2,
    action_space=env.action_space,
    reward_normalization=False,
    # HER-specific arguments: the un-flattened observation space and the env's
    # reward function (presumably used to relabel goals - confirm in
    # sac_her_policy).
    dict_observation_space=observation_space,
    reward_fn=env.compute_reward,
    future_k=2,
)

'''
set up collector
'''
# Vectorized buffer: total size 128, 2 sub-buffers (one per training env).
train_buffer = VectorReplayBuffer(128, 2)

# Training data is gathered through the project-local HERCollector.
train_collector = HERCollector(
    policy,
    train_envs,
    train_buffer,
    exploration_noise=True,
    observation_space=observation_space,
    reward_fn=env.compute_reward,
    k=2,
    strategy='online',
)
# Plain tianshou collector, kept for comparison against the HER collector:
# train_collector = Collector(policy, train_envs, train_buffer, exploration_noise=True)

# Evaluation uses the stock Collector on the single test env.
test_collector = Collector(policy, test_envs)



In [None]:
# Manual collection run, independent of the trainer cell below. As the last
# expression in the cell, the value returned by collect() is displayed.
# NOTE(review): the positional 100 is presumably `n_step`, as in tianshou's
# Collector.collect - confirm against HERCollector's signature.
train_collector.collect(100)

In [2]:
'''
trainer
'''
# Short run: a single epoch of 100 env steps, collecting 10 steps at a time
# and evaluating on one test episode.
result = offpolicy_trainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=1,
    step_per_epoch=100,
    step_per_collect=10,
    episode_per_test=1,
    batch_size=4,
    update_per_step=0.1,
)

Epoch #1:  70%|#######   | 70/100 [00:00<00:00, 170.00it/s, env_step=70, len=5, loss/actor=-45.797, loss/critic1=157019.016, loss/critic2=162117.569, n/ep=2, n/st=10, rew=1.00, succeed=1.00]

Batch(
    obs: array([[107.00002, 509.     ,   2.     ],
                [497.321  , 107.00002,   0.     ],
                [507.80615, 109.     ,   3.     ],
                [507.80615, 109.     ,   3.     ]], dtype=float32),
    act: array([[-0.97810835],
                [-1.        ],
                [-1.        ],
                [-1.        ]], dtype=float32),
    rew: array([[-396.62167  ],
                [  -1.0000229],
                [   0.       ],
                [   0.       ]], dtype=float32),
    done: array([False, False,  True,  True]),
    obs_next: array([[112.378334, 509.      ,   3.      ],
                     [106.      , 107.00002 ,   1.      ],
                     [109.      , 109.      ,   4.      ],
                     [109.      , 109.      ,   4.      ]], dtype=float32),
    info: Batch(
              future_length: array([2, 4, 1, 1]),
              achieved_goal: array([array([[112.378334],
                                    [509.      ]], dtype=float

  logger.write("train/env_step", env_step, {"train/succeed": result["succeed"].mean()})  # CHANGE add success record
  ret = ret.dtype.type(ret / rcount)
  data['succeed'] = f"{result['succeed'].mean():.2f}"
Epoch #1: 101it [00:00, 412.14it/s, env_step=100, len=0, loss/actor=-40.614, loss/critic1=147934.795, loss/critic2=152338.101, n/ep=0, n/st=10, rew=0.00, succeed=nan]                         

Batch(
    obs: array([[481.38922 , 109.      ,   0.      ],
                [112.      ,  18.      ,   7.      ],
                [106.      , 506.76486 ,   1.      ],
                [108.000854, 157.41391 ,   3.      ]], dtype=float32),
    act: array([[-1.        ],
                [-0.99999565],
                [-0.99999446],
                [-0.75793046]], dtype=float32),
    rew: array([[  -3.     ],
                [  95.00087],
                [-399.76376],
                [   0.     ]], dtype=float32),
    done: array([False, False, False,  True]),
    obs_next: array([[106.     , 109.     ,   1.     ],
                     [113.00087,  18.     ,   8.     ],
                     [107.00111, 506.76486,   2.     ],
                     [157.41391, 157.41391,   4.     ]], dtype=float32),
    info: Batch(
              future_length: array([ 4, -3,  3,  1]),
              achieved_goal: array([array([[106.      ],
                                    [107.001945],
                




In [None]:
import pandas as pd
from gym.spaces.utils import unflatten, flatten
# Debug dump: write every transition of each sub-buffer to 'log.csv'.
#
# Rows are accumulated in a plain list and the DataFrame is built once at the
# end. The previous row-by-row `df.append(...)` was quadratic and no longer
# exists in pandas >= 2.0.
rows = []
for i, buffer in enumerate(train_buffer.buffers):
    for transition in buffer:
        # Rebuild the dict-style observation from the flattened vector so the
        # 'observation', 'achieved_goal' and 'desired_goal' parts can be read
        # separately.
        obs_dict = unflatten(observation_space, transition.obs)
        obs_next_dict = unflatten(observation_space, transition.obs_next)
        rows.append({
            'buffer_ID': i,
            # Only the first component of each field is logged.
            'obs': obs_dict['observation'][0],
            'ag': obs_dict['achieved_goal'][0],
            'g': obs_dict['desired_goal'][0],
            'obs_n': obs_next_dict['observation'][0],
            'ag_n': obs_next_dict['achieved_goal'][0],
            'g_n': obs_next_dict['desired_goal'][0],
            'done': transition.done,
            'rew': transition.rew,
        })

df = pd.DataFrame(rows)
df.to_csv('log.csv')

In [None]:
import numpy as np
# Scratch check: for each sampled index, find the first "done" index that lies
# strictly after it inside the same fixed-size sub-buffer; if there is none,
# keep the index itself.
buffer_size = 5
done_index = np.array([3, 8])
current_index = np.array([1, 3, 4, 6, 9])

final_index = []
for step in current_index:
    # Exclusive upper bound of the sub-buffer that contains `step`.
    segment_end = (int(step / buffer_size) + 1) * buffer_size
    later_dones = done_index[(done_index > step) & (done_index < segment_end)]
    final_index.append(min(later_dones) if len(later_dones) != 0 else step)
print(final_index)