# Imports

In [None]:
import sys
sys.path.append('../')

import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator, LinearLocator)
from matplotlib.ticker import ScalarFormatter
import matplotlib.ticker as mtick
from collections import OrderedDict
import pandas as pd
import numpy as np
# import dill
import argparse
import pyrep.backend.sim as sim
from networks.structures import PolicyNetwork, ValueNetwork, SoftQNetwork
import torch

from sim_framework.envs.drone_env import DroneEnv
import time
import itertools
import random
%matplotlib inline

# Utils

In [None]:
def custom_reset(env, variant):
    """Place the agent at a chosen pose and rebuild the observation.

    Parameters
    ----------
    env :
        Environment object; must expose ``agent``, the ``initial_joint_*``
        attributes, ``_make_observation()`` and ``observation``.
    variant : sequence of 6 numbers
        ``variant[:3]`` is used as the position and ``variant[3:]`` as the
        orientation. Both are rounded to two decimals before being applied.

    Returns
    -------
    The observation stored on ``env`` after ``_make_observation()`` ran.
    """
    def _rounded(values):
        # Two-decimal rounding, converted to a plain Python list.
        return np.round(values, 2).tolist()

    target_position = variant[:3]
    target_orientation = variant[3:]
    drone = env.agent

    # Clear the action remembered from the previous episode.
    env.current_action = np.array([0, 0, 0, 0])

    # Orientation is applied before position, as in the original sequence.
    drone.set_orientation(_rounded(target_orientation))
    drone.set_position(_rounded(target_position))

    # Zero the propulsion and restore the joints to their initial values.
    drone.set_thrust_and_torque(np.zeros(4), force_zero=True)
    drone.set_joint_positions(env.initial_joint_positions)
    drone.set_joint_target_velocities(env.initial_joint_velocities)
    drone.set_joint_target_positions(env.initial_joint_target_positions)

    # Flag the next observation as the first one, then recompute it.
    env.first_obs = True
    env._make_observation()
    fresh_observation = env.observation
    env.last_state = fresh_observation[:18]

    return env.observation

In [None]:

def state_to_tensor(state, device, use_double=None):
    """Convert an observation into a batched torch tensor on ``device``.

    Parameters
    ----------
    state : array-like
        Observation values (e.g. a numpy array or a list of numbers).
    device : torch.device or str
        Device the resulting tensor is moved to.
    use_double : bool, optional
        ``True`` selects float64, ``False`` selects float32. When omitted,
        the notebook-level ``args.use_double`` flag is used, which keeps
        existing two-argument calls working unchanged.

    Returns
    -------
    torch.Tensor
        The data with a leading dimension of size 1 added.
    """
    if use_double is None:
        # Backward-compatible fallback to the global notebook configuration.
        use_double = args.use_double
    dtype = torch.float64 if use_double else torch.float32
    # torch.tensor copies the data and replaces the legacy typed constructors.
    return torch.tensor(state, dtype=dtype).unsqueeze(0).to(device)



In [17]:


def rollouts(
        variant,
        env,
        policy,
        action_range,
        device,
        max_timesteps=1000,
        time_horizon=250):
    """
    Run one deterministic-policy episode starting from a given initial pose.

    Parameters
    ----------
    variant : sequence of 6 numbers
        Initial pose handed to ``custom_reset`` (position followed by
        orientation).
    env :
        Environment exposing ``step(action)`` plus whatever ``custom_reset``
        requires.
    policy :
        Object with a ``deterministic_action(state_tensor)`` method. It may
        return either the action alone or an ``(action, info)`` 2-tuple.
    action_range : list
        ``[low, high]`` bounds; the policy output is scaled by
        ``action_range[1]`` before being sent to the environment.
    device :
        Torch device forwarded to ``state_to_tensor``.
    max_timesteps : int, optional
        Currently unused; kept so existing callers keep working.
        By default 1000.
    time_horizon : int, optional
        Maximum number of steps in the episode, by default 250.

    Returns
    -------
    dict
        Keys ``'obs'``, ``'next_obs'``, ``'rewards'``, ``'actions'``,
        ``'dones'`` and ``'infos'``. Each value is a one-element list that
        holds the per-step list for the single episode that was run.
    """
    mb_obs, mb_next_obs, mb_rewards, mb_actions, mb_dones, mb_infos = [], [], [], [], [], []

    obs0 = custom_reset(env, variant)

    for _ in range(time_horizon):
        # Query the policy exactly once. The previous try/bare-except form
        # called it a second time whenever tuple-unpacking failed, and it
        # also swallowed unrelated errors such as KeyboardInterrupt.
        policy_out = policy.deterministic_action(state_to_tensor(obs0, device))
        if isinstance(policy_out, tuple) and len(policy_out) == 2:
            actions = policy_out[0]  # (action, agent_info) -> keep the action
        else:
            actions = policy_out

        # Take the action in the environment and record the transition.
        obs1, rewards, dones, infos = env.step(actions * action_range[1])
        mb_obs.append(obs0)
        mb_next_obs.append(obs1)
        mb_actions.append(actions)
        mb_dones.append(dones)
        mb_rewards.append(rewards)
        mb_infos.append(infos)

        if dones:
            break

        obs0 = obs1

    print('rewards: sum = {0}'.format(np.sum(mb_rewards)))

    # Each entry is wrapped in a list so callers can keep indexing
    # set_tau[key][0] for this single episode.
    set_tau = {'obs': [mb_obs],
               'next_obs': [mb_next_obs],
               'rewards': [mb_rewards],
               'actions': [mb_actions],
               'dones': [mb_dones],
               'infos': [mb_infos]}
    return set_tau

# Variables

In [28]:
class Args:
    """Empty attribute container; settings are attached to an instance after creation."""


args = Args()

# Episode length (steps) and the step budget forwarded to rollouts().
args.H = 250
args.max_timesteps = 250

# Environment configuration.
env_reset_mode = "Discretized_Uniform"
seed = 42
headless = True
# headless = False

state = 'New_action'
reward = 'Normal'

# Best-effort shutdown of an environment left over from a previous run of this
# cell. `env` does not exist on a fresh kernel (NameError), and the shutdown
# itself may fail. Both are ignored on purpose, but KeyboardInterrupt and
# SystemExit are no longer swallowed as they were by the bare `except:`.
try:
    env.shutdown()
except Exception:
    pass

env = DroneEnv(random=env_reset_mode, seed=seed, headless=headless, state=state,
               reward_function_name=reward)


# Permutation List

In [11]:
## Ticks for Discretized_Uniform initialization setup
xy_ticks = env.x_y_ticks
z_ticks = env.z_ticks
ang_ticks = env.ang_ticks

# Keep only the first and last tick of every axis; angles also get 0.
extreme_angles = ang_ticks[[0, -1]]
extreme_angles = np.append(extreme_angles, 0)
extreme_xyticks = xy_ticks[[0, -1]]
extreme_zticks = z_ticks[[0, -1]]

# Axis order matches what custom_reset expects: x, y, z, then three angles.
all_list = [extreme_xyticks, extreme_xyticks, np.round(extreme_zticks, 2),
            extreme_angles, extreme_angles, extreme_angles]

# Cartesian product of all per-axis choices.
res = list(itertools.product(*all_list))
# Shuffle with a dedicated generator seeded from the configured `seed`, so the
# evaluation order is reproducible and the global `random` state is untouched.
random.Random(seed).shuffle(res)
print('Number of different variants')
print(len(res))

Number of different variants
216


# Running

In [24]:
args.use_double = False
args.use_cuda = False

# Use the GPU only when it is both available and explicitly requested.
use_cuda = torch.cuda.is_available()

if use_cuda and (args.use_cuda):
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


restore_path = '../saved_policies/sac_optimal_policy_2.pt'

# Load the checkpoint onto the CPU. A single call replaces the former
# try/except BaseException, whose fallback was an equivalent call and which
# could also swallow KeyboardInterrupt.
checkpoint = torch.load(restore_path, map_location=torch.device('cpu'))
print('Finished Loading')

# Neural network parameters
try:
    state_dim = env.observation_space.shape[0]
except (AttributeError, TypeError, IndexError):
    # observation_space has no usable .shape; use the value itself as the size.
    state_dim = env.observation_space
action_dim = env.action_space.shape[0]
# Layer sizes are read from the first linear layer stored in the checkpoint.
hidden_dim = checkpoint['linear1.weight'].data.shape[0]
action_range = [env.agent.action_space.low.min(),
                env.agent.action_space.high.max()]
size_obs = checkpoint['linear1.weight'].data.shape[1]

assert size_obs == state_dim, 'Checkpoint state must be the same as the env'


policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)


Finished Loading


In [25]:
# Load the saved parameters into the policy network built in the previous cell.
# `checkpoint` is passed directly as the state dict; it is the same mapping
# whose 'linear1.weight' entry was used to size the network.
policy_net.load_state_dict(checkpoint)
print('Finished Loading the weights')


Finished Loading the weights


In [19]:
# Run one episode per initial-pose variant and collect per-episode statistics.
list_of_numsteps, list_of_rewards, list_of_variants = [], [], []

begin = time.time()
for k, variant in enumerate(res):
    print(k)
    set_tau = rollouts(variant, env, policy_net, action_range, device,
                       max_timesteps=args.max_timesteps,
                       time_horizon=args.H)
    # rollouts() wraps its single episode in a one-element list.
    episode_obs = set_tau['obs'][0]
    episode_rewards = set_tau['rewards'][0]
    list_of_numsteps.append(len(episode_obs))
    list_of_rewards.append(np.sum(episode_rewards))
    list_of_variants.append(variant)
end = time.time()

elapsed_message = "Time = {0:.2f}".format(end-begin)
print(elapsed_message)

0
rewards: sum = 877.1200845534663
1
rewards: sum = 869.9280922522692
2
rewards: sum = 891.6576752246817
3
rewards: sum = 883.0101529152173
4
rewards: sum = 896.3264992912492
5
rewards: sum = 874.0152384834564
6
rewards: sum = 900.5373236322093
7
rewards: sum = 899.9096966483784
8
rewards: sum = 894.9760864179252
9
rewards: sum = 886.492229917679
10
rewards: sum = 901.0729021115104
11
rewards: sum = 887.2421559703803
12
rewards: sum = 889.0111172262613
13
rewards: sum = 860.6523525027959
14
rewards: sum = 862.3917533988092
15
rewards: sum = 888.6664420551124
16
rewards: sum = 904.3786044462072
17
rewards: sum = 897.1422578799277
18
rewards: sum = 904.8975850526614
19
rewards: sum = 878.4370341487543
20
rewards: sum = 896.7519630918687
21
rewards: sum = 871.8436541283222
22
rewards: sum = 878.558593293236
23
rewards: sum = 904.699504927172
24
rewards: sum = 880.1194983128975
25
rewards: sum = 878.523675280956
26
rewards: sum = 882.9461452790712
27
rewards: sum = 863.4715386745926
28
rew

In [20]:
# One row per evaluated variant: initial pose, summed reward, episode length.
df = pd.DataFrame({'Variant': list_of_variants, "Reward" : list_of_rewards, 'Len' : list_of_numsteps})

# An episode counts as successful when it ran for the full horizon, i.e. it
# was not cut short by `done`. The threshold is the configured args.H rather
# than a hard-coded 250, so the metric stays correct if the horizon changes.
n_early_stops = len(df[df['Len'] < args.H])

print('Percentage of successful runs')
print((1 - n_early_stops/len(df))*100)

display(df[['Reward']].describe().T)

print(df[['Reward']].median())

Percentage of successful runs
100.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Reward,216.0,886.135645,12.723838,857.749661,876.962865,886.816735,896.847522,908.599989


Reward    886.816735
dtype: float64


# Anecdotal Run

In [27]:
## Remember to change the headless flag!
# Restart the simulation before the single hand-picked run.
env.pr.stop()
env.pr.start()

anecdotal_variants = [[0,0,6,0,1.7,0]]
for variant in anecdotal_variants:
    set_tau = rollouts(variant, env, policy_net, action_range, device,
                       max_timesteps=args.max_timesteps,
                       time_horizon=args.H)

rewards: sum = 842.9171796620125
