In [1]:
import argparse
import datetime
import os
import pprint
import yaml

import gym, gym_xarm
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, ReplayBuffer, VectorReplayBuffer, Batch
from tianshou.env import DummyVectorEnv, SubprocVectorEnv
from her.offpolicy import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ActorProb, Critic
from tianshou.policy import SACPolicy, BasePolicy

import gym_naive
from her.her_collector import HERCollector
from her.sac_her_policy import SACHERPolicy

'''
make env
'''
# Options forwarded to the 'Incremental-v0' constructor via its `config` kwarg.
config = {'dim': 2, 'reward_type': 'sparse'}


def make_env():
    """Create one 'Incremental-v0' env wrapped in FlattenObservation."""
    raw_env = gym.make('Incremental-v0', config=config)
    return gym.wrappers.FlattenObservation(raw_env)


# The reference env is assembled by hand rather than through make_env() so the
# original (un-flattened) observation space can be captured before wrapping.
env = gym.make('Incremental-v0', config=config)
observation_space = env.observation_space
env = gym.wrappers.FlattenObservation(env)

# Sizes used to build the networks are read off one reset observation
# and the action space.
obs = env.reset()
state_shape = len(obs)
action_shape = env.action_space.shape or env.action_space.n

# Two training envs behind SubprocVectorEnv, a single test env behind DummyVectorEnv.
train_envs = SubprocVectorEnv([make_env for _ in range(2)], norm_obs=False)
test_envs = DummyVectorEnv([make_env])

'''
build and init network
'''
# NOTE: the construction order below is kept as-is on purpose; every layer
# that gets created draws from torch's RNG for its initial weights.

# actor
net_a = Net(state_shape, hidden_sizes=[16], device='cpu')
actor = ActorProb(
    net_a, action_shape,
    max_action=env.action_space.high[0],
    device='cpu', unbounded=True, conditioned_sigma=True,
)
actor = actor.to('cpu')
actor_optim = torch.optim.Adam(actor.parameters(), lr=0.001)

# critics: two identically configured networks, each with its own optimizer
net_c1 = Net(state_shape, action_shape, hidden_sizes=[16], concat=True, device='cpu')
net_c2 = Net(state_shape, action_shape, hidden_sizes=[16], concat=True, device='cpu')

critic1 = Critic(net_c1, device='cpu')
critic1 = critic1.to('cpu')
critic1_optim = torch.optim.Adam(critic1.parameters(), lr=0.001)

critic2 = Critic(net_c2, device='cpu')
critic2 = critic2.to('cpu')
critic2_optim = torch.optim.Adam(critic2.parameters(), lr=0.001)

'''
set up policy
'''
# The six positional arguments are the actor and the two critics, each paired
# with its optimizer; everything else is passed by keyword.
policy = SACHERPolicy(
    actor, actor_optim,
    critic1, critic1_optim,
    critic2, critic2_optim,
    tau=0.005,
    gamma=0.9,
    alpha=0.2,
    estimation_step=2,
    action_space=env.action_space,
    reward_normalization=False,
    # HER-specific inputs: the dict observation space captured before
    # flattening and the env's own reward function.
    dict_observation_space=observation_space,
    reward_fn=env.compute_reward,
    future_k=2,
)

'''
set up collector
'''
# Buffer of total size 64 split across 2 sub-buffers (one per training env).
train_buffer = VectorReplayBuffer(64, 2)
train_collector = HERCollector(
    policy,
    train_envs,
    train_buffer,
    exploration_noise=True,
    observation_space=observation_space,
    reward_fn=env.compute_reward,
    k=2,
    strategy='offline',
)
# Plain tianshou collector, kept as a drop-in alternative to the HER collector:
# train_collector = Collector(policy, train_envs, train_buffer, exploration_noise=True)
test_collector = Collector(policy, test_envs)



In [26]:
# Debug probe: send the raw message ["change", {'step': 3}] over worker 1's
# `parent_remote` connection, bypassing the vector-env API.
# NOTE(review): presumably a custom "change" command is handled on the worker
# side — nothing in this notebook defines it, so confirm before relying on it.
train_collector.env.workers[1].parent_remote.send(["change", {'step':3}])


In [27]:
# Step both training envs once with action 1 each; as the last expression of
# the cell, the returned tuple is displayed as the cell output.
train_collector.env.step([1,1])

(array([[131.,  36.,  26.],
        [108.,  13.,   3.]], dtype=float32),
 array([1., 0.]),
 array([ True, False]),
 array([{'is_success': 1.0, 'future_length': -21, 'achieved_goal': None, 'env_id': 0},
        {'is_success': 0.0, 'future_length': 2, 'achieved_goal': None, 'env_id': 1}],
       dtype=object))

In [None]:
'''
trainer
'''
# Deliberately tiny run (1 epoch, 100 steps, batch of 4).
result = offpolicy_trainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=1,
    step_per_epoch=100,
    step_per_collect=10,
    episode_per_test=1,
    batch_size=4,
    update_per_step=0.1,
)

In [None]:
import pandas as pd
from gym.spaces.utils import unflatten, flatten
# Debug aid: write every transition stored in the training buffers to
# 'log.csv' for manual inspection.
#
# Rows are gathered in a plain list and the DataFrame is built once at the
# end. The previous per-row `df.append(...)` copied the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
records = []
for i, buffer in enumerate(train_buffer.buffers):
    for data in buffer:
        # Rebuild the dict observations from their flattened form.
        obs_dict = unflatten(observation_space, data.obs)
        obs_next_dict = unflatten(observation_space, data.obs_next)
        # A separate name is used for the row so the loop variable `data`
        # is not rebound while it is still being read.
        row = {
            'buffer_ID': i,
            'obs': obs_dict['observation'][0],
            'ag': obs_dict['achieved_goal'][0],
            'g': obs_dict['desired_goal'][0],
            'obs_n': obs_next_dict['observation'][0],
            'ag_n': obs_next_dict['achieved_goal'][0],
            'g_n': obs_next_dict['desired_goal'][0],
            'done': data.done,
            'rew': data.rew,
        }
        records.append(row)

df = pd.DataFrame(records)
df.to_csv('log.csv')

In [None]:
import numpy as np
# Scratch check: for each sampled index, find the first "done" index that
# comes after it inside the same fixed-size sub-buffer; if there is none,
# keep the index itself.
buffer_size = 5
done_index = np.array([3, 8])
current_index = np.array([1, 3, 4, 6, 9])

final_index = []
for idx in current_index:
    # Exclusive upper bound of the sub-buffer that contains idx.
    segment_end = (idx // buffer_size + 1) * buffer_size
    later_dones = done_index[(done_index > idx) & (done_index < segment_end)]
    final_index.append(later_dones.min() if later_dones.size else idx)
print(final_index)

In [3]:
import numpy as np
# Scratch check: index -1 addresses the last element, so the 3 is overwritten.
a = np.array([1, 2, 3])
a[-1] = 2
a

array([1, 2, 2])