### Testing the ENV

#### 1.Discrete Space

In [4]:
def sample_action(action_spec):
    """Draw a random integer action that fits the BoundedArraySpec."""
    low  = action_spec.minimum
    high = action_spec.maximum
    # note: high is inclusive in the BoundedArraySpec, so +1 for randint
    return np.random.randint(low, high + 1, size=action_spec.shape, dtype=action_spec.dtype)

In [9]:
import env
from env import Env_Discrete
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories.time_step import StepType

In [10]:
env = Env_Discrete()

# 1) check specs
a_spec = env.action_spec()
o_spec = env.observation_spec()
print("Action spec:", a_spec)
print("Observation spec:", o_spec)

# 2) reset
ts = env.reset()
print("\nAfter reset:")
print("  step_type:", "FIRST" if ts.step_type == StepType.FIRST else ts.step_type)
print("  obs      :", ts.observation)
print("  reward   :", ts.reward)
print("  discount :", ts.discount)

# 3) take 5 random steps
print("\nStepping 5 times with random actions:")
for i in range(5):
    a = sample_action(a_spec)
    ts = env.step(a)
    st = ("MID" if ts.step_type == StepType.MID else
          "LAST" if ts.step_type == StepType.LAST else
          ts.step_type)
    print(f" Step {i+1:>2}: action={a} → obs={ts.observation}, reward={ts.reward:.4f}, step_type={st}")
    if ts.step_type == StepType.LAST:
        break


Action spec: BoundedArraySpec(shape=(3,), dtype=dtype('int32'), name='action', minimum=[0 0 0], maximum=[6 4 2])
Observation spec: BoundedArraySpec(shape=(3,), dtype=dtype('float32'), name='observation', minimum=-3.4028234663852886e+38, maximum=3.4028234663852886e+38)

After reset:
  step_type: FIRST
  obs      : [-1.5 -1.5 -1.5]
  reward   : 0.0
  discount : 1.0

Stepping 5 times with random actions:


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [12]:
env.step(a)

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([-0.9, -1.3, -3.5], dtype=float32),
 'reward': array([   3.8809,    2.9369, -127.6375], dtype=float32),
 'step_type': array([1, 1, 1], dtype=int32)})

In [12]:
a_spec.maximum - a_spec.minimum + 1

array([  5,   3, 100], dtype=int32)

In [14]:
np.sum(a_spec.minimum)

0

#### 2.Continouse Space

In [6]:
def sample_continuous(action_spec):
    """Uniformly sample a float vector in [minimum, maximum]."""
    low, high = action_spec.minimum, action_spec.maximum
    return np.random.uniform(low, high, size=action_spec.shape).astype(action_spec.dtype)

In [7]:
env = Env_Continue()

# 1) check specs
a_spec = env.action_spec()
o_spec = env.observation_spec()
print("Action spec:", a_spec)
print("Observation spec:", o_spec)

# 2) reset
ts = env.reset()
print("\nAfter reset:")
print("  step_type:", "FIRST" if ts.step_type == StepType.FIRST else ts.step_type)
print("  obs      :", ts.observation)
print("  reward   :", ts.reward)
print("  discount :", ts.discount)

# 3) take 5 random steps
print("\nStepping 5 times with random actions:")
for i in range(5):
    a = sample_continuous(a_spec)
    ts = env.step(a)
    st = ("MID" if ts.step_type == StepType.MID else
          "LAST" if ts.step_type == StepType.LAST else
          ts.step_type)
    print(f" Step {i+1:>2}: action={a} → obs={ts.observation}, reward={ts.reward:.4f}, step_type={st}")
    if ts.step_type == StepType.LAST:
        break

Action spec: BoundedArraySpec(shape=(3,), dtype=dtype('float32'), name='action', minimum=[-2.  3.  1.], maximum=[  2.   5. 100.])
Observation spec: BoundedArraySpec(shape=(3,), dtype=dtype('float32'), name='observation', minimum=-3.4028234663852886e+38, maximum=3.4028234663852886e+38)

After reset:
  step_type: FIRST
  obs      : [0. 0. 0.]
  reward   : 0.0
  discount : 1.0

Stepping 5 times with random actions:
 Step  1: action=[ 1.8997207  4.6012278 89.42736  ] → obs=[ 1.8997207  4.6012278 89.42736  ], reward=-8022.0332, step_type=MID
 Step  2: action=[-1.957699   4.7816534 73.24402  ] → obs=[-5.7978272e-02  9.3828812e+00  1.6267139e+02], reward=-26550.0215, step_type=MID
 Step  3: action=[-0.91692525  4.8964863  64.06855   ] → obs=[ -0.9749035  14.279367  226.73993  ], reward=-51615.8477, step_type=MID
 Step  4: action=[ 0.9561594  3.2213173 55.088135 ] → obs=[-1.8744111e-02  1.7500685e+01  2.8182806e+02], reward=-79733.3281, step_type=MID
 Step  5: action=[-0.28203082  4.453713   4

### End Env Test

In [1]:
import discrete_RL_train
from discrete_RL_train import discrete_RL_train as RLTrain

2025-07-05 17:46:31.942278: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-05 17:46:31.944927: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-05 17:46:31.982828: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-05 17:46:31.983861: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-07-05 17:46:33.928505: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuI

In [2]:
trainner = RLTrain()

In [3]:
trainner.train()

train_env.batch_size = parallel environment number =  6
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=True)` instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=True)` instead.


[step   0] mean entropy per-dim = 2.0641
final reward before udpate: -1000000000.0
final reward after udpate: -14.0625
updated final_solution= [-1.5 -1.5 -1.5]
train_step no.= 1
best_solution of this generation= [-1.5 -1.5 -1.5]
best step reward= -14.062
avg step reward= -102304.055
best_step_index: [0, 1]
collect_traj: [-1.5       -1.5       -1.5       -1.6       -1.6       -1.7
 -1.8       -1.8       -1.9       -2.        -2.        -2.1
 -2.1       -2.2       -2.3       -2.4       -2.5       -2.5
 -2.6       -2.6       -2.7       -2.7       -2.8       -2.8
 -2.9       -3.        -3.        -3.        -3.1       -3.1
 -3.1       -3.2       -3.2       -3.3       -3.3       -3.3
 -3.3       -3.3       -3.4       -3.4       -3.4       -3.4
 -3.4       -3.4       -3.5       -3.5       -3.5       -3.6000001
 -3.7       -3.8      ]
test_traj [-1.5       -1.6       -1.7       -1.8       -1.9       -2.
 -2.1       -2.2       -2.3       -2.4       -2.5       -2.6
 -2.7       -2.8       -2.9  

([-14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -14.0625,
  -1

In [13]:
state = [1,2,3]

In [14]:
-np.sum(np.square(state))

-14

In [15]:
np.square(state)

array([1, 4, 9])