In [1]:
import gymnasium as gym
import numpy as np
import tensorflow as tf

import bc as BC
import trajectories as T
import suboptimality as S
import evaluation as E

2023-11-28 01:13:47.152415: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load and Augment Trajectories

In [2]:
# Run the demonstrations exposed as `T.trajectories` through the module's
# discretization helper; the augmentation cells below iterate over the result.
raw_trajectories = T.trajectories
trajectories = T.discretize_trajectories(raw_trajectories)

In [3]:
# Seed every RNG this notebook can reach so that Restart & Run All is as
# reproducible as possible (training and evaluation are otherwise unseeded).
SEED = 0
# One call seeds Python's `random`, NumPy's global RNG, and TensorFlow.
tf.keras.utils.set_random_seed(SEED)

# Environment shared by the augmentation cells and by policy evaluation.
env = gym.make('MountainCar-v0')
# Seed the env's own generator (used for initial states) and the samplers of
# its action/observation spaces.
# NOTE(review): this only helps if the S/E helpers do not reseed the env or
# use private unseeded generators — confirm against those modules.
env.reset(seed=SEED)
env.action_space.seed(SEED)
env.observation_space.seed(SEED)

In [4]:
# One observation-ambiguity variant per demonstration, in the original order.
# The 0.2 is passed straight to S.observation_ambiguity; its meaning is not
# visible here (presumably a noise/ambiguity level — TODO confirm).
observational_ambiguity_trajectories = [
    S.observation_ambiguity(demo, env, 0.2) for demo in trajectories
]

In [5]:
# One action-ambiguity variant per demonstration, in the original order.
# The 0.2 is passed straight to S.action_ambiguity; its meaning is not
# visible here (presumably a noise/ambiguity level — TODO confirm).
action_ambiguity_trajectories = [
    S.action_ambiguity(demo, env, 0.2) for demo in trajectories
]

In [6]:
# Integer pairs handed to S.static_occlusion as its second argument
# (presumably discretized state indices to occlude — TODO confirm).
target_states = [(3, 5), (7, 7), (8, 8), (12, 11), (15, 10)]

# Apply the same fixed set of targets to every demonstration trajectory.
static_occlusion_trajectories = [
    S.static_occlusion(demo, target_states) for demo in trajectories
]

In [7]:
# One dynamic-occlusion variant per demonstration, in the original order.
# The 0.2 is passed straight to S.dynamic_occlusion; its meaning is not
# visible here (presumably an occlusion rate — TODO confirm).
dynamic_occlusion_trajectories = [
    S.dynamic_occlusion(demo, 0.2) for demo in trajectories
]

In [8]:
# Collect the four augmented datasets. This order is what every later
# per-policy list (converted data, policies, rewards) follows.
suboptimal_trajectories = [
    observational_ambiguity_trajectories,
    action_ambiguity_trajectories,
    static_occlusion_trajectories,
    dynamic_occlusion_trajectories,
]

# Convert each dataset into the form the behaviour-cloning module trains on.
converted_trajectories = [
    BC.convert_trajectories(dataset) for dataset in suboptimal_trajectories
]

### Train Policies

In [9]:
# Train one behaviour-cloning model per converted dataset and wrap it in a
# policy. Each converted entry unpacks into two parts (presumably states and
# actions — confirm against BC.convert_trajectories), which are handed to
# BC.model as a single tuple.
policies = [
    BC.policy(BC.model((states, actions)))
    for states, actions in converted_trajectories
]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: [0.5540763735771179, 0.823913037776947]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: [0.5534578561782837, 0.823913037776947]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: [0.5518237948417664, 0.823913037776947]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: [0.5520044565200806, 0.8260869383811951]


### Evaluate Policies

In [11]:
# Score every trained policy in the shared environment, keeping the order of
# `policies`. The 5 is passed straight to E.average_reward (presumably the
# number of evaluation episodes — TODO confirm).
rewards = [E.average_reward(env, policy, 5) for policy in policies]

In [12]:
# Show the average reward per policy, in the same order as
# `suboptimal_trajectories`: observational ambiguity, action ambiguity,
# static occlusion, dynamic occlusion.
# NOTE(review): Gymnasium documents MountainCar-v0 as giving -1 reward per
# step with a 200-step limit, so a plain mean episode return would fall in
# [-200, 0). Values outside that range would mean E.average_reward measures
# something else — confirm what it returns.
rewards

[-9.0, 1.0, -434.8, -15.8]