### Testing the ENV

#### 1. Discrete Space

In [1]:
def sample_action(action_spec):
    """Draw a uniformly random integer action that fits a bounded spec.

    Args:
        action_spec: Object exposing ``minimum``, ``maximum``, ``shape`` and
            ``dtype`` (e.g. the BoundedArraySpec returned by
            ``env.action_spec()``). Both bounds are treated as inclusive.

    Returns:
        np.ndarray of shape ``action_spec.shape`` and dtype
        ``action_spec.dtype`` with ``minimum <= value <= maximum``
        element-wise.
    """
    low = action_spec.minimum
    # randint's upper bound is exclusive, so the inclusive maximum needs +1.
    # Widen to int64 first: adding 1 in the spec's own dtype wraps around when
    # a maximum equals that dtype's largest value (e.g. int32 max), which made
    # randint fail with "low >= high".
    # NOTE(review): a 64-bit spec bounded at its dtype maximum would still
    # overflow here — not handled.
    high_exclusive = np.asarray(action_spec.maximum, dtype=np.int64) + 1
    return np.random.randint(
        low, high_exclusive, size=action_spec.shape, dtype=action_spec.dtype
    )

In [2]:
import Optimiser
import Optimiser.env
from Optimiser.env import Env_Discrete
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories.time_step import StepType

2025-07-09 16:35:57.343112: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-09 16:35:57.377651: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-09 16:35:57.377684: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-09 16:35:57.379155: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-09 16:35:57.386862: I tensorflow/core/platform/cpu_feature_guar

In [3]:
def my_weird_reward(x: np.ndarray) -> float:
    """Return the negative squared distance of ``x`` from the all-threes point.

    The value is 0 when every component of ``x`` equals 3 and becomes more
    negative as components move away from 3. This is only an example reward;
    any other function of ``x`` can be plugged in instead.
    """
    offset = x - 3
    squared_distance = np.sum(offset ** 2)
    return -squared_distance

In [6]:
# Smoke test for the discrete environment: print its specs, reset it, then
# drive it with a handful of random actions and print every transition.
env = Env_Discrete(reward_fn=my_weird_reward)

# 1) check specs
a_spec = env.action_spec()
o_spec = env.observation_spec()
print("Action spec:", a_spec)
print("Observation spec:", o_spec)

# 2) reset
# The result is bound to `timestep` rather than `ts`, because `ts` is the
# tf_agents time_step module imported at the top of the notebook and must not
# be clobbered.
timestep = env.reset()
print("\nAfter reset:")
print("  step_type:", "FIRST" if timestep.step_type == StepType.FIRST else timestep.step_type)
print("  obs      :", timestep.observation)
print("  reward   :", timestep.reward)
print("  discount :", timestep.discount)

# 3) take 5 random steps
print("\nStepping 5 times with random actions:")
for i in range(5):
    a = sample_action(a_spec)
    timestep = env.step(a)
    # Translate the step type to a readable label; anything that is neither
    # MID nor LAST is printed raw.
    st = ("MID" if timestep.step_type == StepType.MID else
          "LAST" if timestep.step_type == StepType.LAST else
          timestep.step_type)
    print(f" Step {i+1:>2}: action={a} → obs={timestep.observation}, reward={timestep.reward:.4f}, step_type={st}")
    # Stop early once the episode has ended.
    if timestep.step_type == StepType.LAST:
        break


Action spec: BoundedArraySpec(shape=(4,), dtype=dtype('int32'), name='action', minimum=[0 0 0 0], maximum=[ 30   8  20 600])
Observation spec: BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=-3.4028234663852886e+38, maximum=3.4028234663852886e+38)

After reset:
  step_type: FIRST
  obs      : [  1.   0.   1. -50.]
  reward   : 0.0
  discount : 1.0

Stepping 5 times with random actions:
 Step  1: action=[23  1 15 35] → obs=[ 9.0e+00 -1.5e+00  3.5e+00 -2.7e+03], reward=-7306265.5000, step_type=MID
 Step  2: action=[30  2 18 21] → obs=[ 2.40e+01 -2.50e+00  7.50e+00 -5.49e+03], reward=-30173540.0000, step_type=MID
 Step  3: action=[ 27   2  11 148] → obs=[ 3.60e+01 -3.50e+00  8.00e+00 -7.01e+03], reward=-49183324.0000, step_type=MID
 Step  4: action=[  0   5   3 163] → obs=[ 2.10e+01 -3.00e+00  4.50e+00 -8.38e+03], reward=-70275048.0000, step_type=MID
 Step  5: action=[ 18   7  11 350] → obs=[ 2.40e+01 -1.50e+00  5.00e+00 -7.88e+03], reward=-62142156.0000, 

In [7]:
# Apply the most recently sampled action `a` once more and display the raw
# TimeStep returned by the environment.
env.step(a)

TimeStep(
{'step_type': array(1, dtype=int32),
 'reward': array(-54509280., dtype=float32),
 'discount': array(1., dtype=float32),
 'observation': array([ 2.70e+01,  0.00e+00,  5.50e+00, -7.38e+03], dtype=float32)})

In [8]:
# Number of distinct integer values per action dimension; both bounds are
# inclusive, hence the +1.
a_spec.maximum - a_spec.minimum + 1

array([ 31,   9,  21, 601], dtype=int32)

In [9]:
# Sum of the per-dimension lower bounds of the action spec.
np.sum(a_spec.minimum)

0

#### 2. Continuous Space

In [None]:
def sample_continuous(action_spec):
    """Draw a uniformly random float action within the spec's bounds.

    Values come from ``np.random.uniform``, i.e. the half-open interval
    ``[minimum, maximum)`` per element, and are then cast to the spec's dtype.

    Args:
        action_spec: Object exposing ``minimum``, ``maximum``, ``shape`` and
            ``dtype``.

    Returns:
        np.ndarray of shape ``action_spec.shape`` and dtype
        ``action_spec.dtype``.
    """
    draw = np.random.uniform(
        action_spec.minimum,
        action_spec.maximum,
        size=action_spec.shape,
    )
    return draw.astype(action_spec.dtype)

In [None]:
# Smoke test for the continuous environment: print its specs, reset it, then
# drive it with a handful of random actions and print every transition.

# Env_Continue is not imported anywhere else in this notebook, so without this
# import the cell fails with NameError on a fresh kernel.
# NOTE(review): assumed to live in Optimiser.env next to Env_Discrete — confirm.
from Optimiser.env import Env_Continue

env = Env_Continue()

# 1) check specs
a_spec = env.action_spec()
o_spec = env.observation_spec()
print("Action spec:", a_spec)
print("Observation spec:", o_spec)

# 2) reset
# The result is bound to `timestep` rather than `ts`, because `ts` is the
# tf_agents time_step module imported at the top of the notebook and must not
# be clobbered.
timestep = env.reset()
print("\nAfter reset:")
print("  step_type:", "FIRST" if timestep.step_type == StepType.FIRST else timestep.step_type)
print("  obs      :", timestep.observation)
print("  reward   :", timestep.reward)
print("  discount :", timestep.discount)

# 3) take 5 random steps
print("\nStepping 5 times with random actions:")
for i in range(5):
    a = sample_continuous(a_spec)
    timestep = env.step(a)
    # Translate the step type to a readable label; anything that is neither
    # MID nor LAST is printed raw.
    st = ("MID" if timestep.step_type == StepType.MID else
          "LAST" if timestep.step_type == StepType.LAST else
          timestep.step_type)
    print(f" Step {i+1:>2}: action={a} → obs={timestep.observation}, reward={timestep.reward:.4f}, step_type={st}")
    # Stop early once the episode has ended.
    if timestep.step_type == StepType.LAST:
        break

### End Env Test

In [10]:
# NOTE(review): the recorded run of this cell raised ModuleNotFoundError for
# 'discrete_RL_train'. The module has to be importable (installed or on
# sys.path) before the trainer cells that follow can run — confirm its
# actual location.
import discrete_RL_train
from discrete_RL_train import discrete_RL_train as RLTrain

ModuleNotFoundError: No module named 'discrete_RL_train'

In [None]:
# Build the trainer with no arguments; depends on RLTrain having been
# imported successfully beforehand.
trainner = RLTrain()

In [None]:
# Invoke the trainer's train() method.
# NOTE(review): what it trains and what it returns is not visible in this notebook.
trainner.train()

In [None]:
# Toy state vector used by the scratch calculations that follow.
state = [1,2,3]

In [None]:
# Scratch check: negative sum of the squared entries of `state`.
-np.sum(np.square(state))

In [None]:
# Scratch check: element-wise squares of `state`.
np.square(state)