In [1]:
import gymenv_v2
from gymenv_v2 import make_multiple_env
import numpy as np
from config import custom_config, easy_config, hard_config
from layers import Embedding
import tensorflow as tf
from policy import Policy, RandomPolicy
from rollout import rollout, rollout_multiple
import tensorflow_probability as tfp
from utils import discounted_rewards, AdamOptimizer
import os
import time

import wandb
wandb.login()
run=wandb.init(project="finalproject", entity="orcs4529", tags=["training-easy"])

%load_ext autoreload
%autoreload 2

2022-12-15 06:08:49.158913: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-15 06:08:50.586462: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2022-12-15 06:08:50.586682: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

In [2]:
env = make_multiple_env(**easy_config) 

loading training instances, dir instances/train_10_n60_m60 idx 0
loading training instances, dir instances/train_10_n60_m60 idx 1
loading training instances, dir instances/train_10_n60_m60 idx 2
loading training instances, dir instances/train_10_n60_m60 idx 3
loading training instances, dir instances/train_10_n60_m60 idx 4
loading training instances, dir instances/train_10_n60_m60 idx 5
loading training instances, dir instances/train_10_n60_m60 idx 6
loading training instances, dir instances/train_10_n60_m60 idx 7
loading training instances, dir instances/train_10_n60_m60 idx 8
loading training instances, dir instances/train_10_n60_m60 idx 9


In [3]:
if not os.path.exists("results"):
    os.mkdir("results")

## Params

In [4]:
units = [64, 64, 64]
activations = ['relu', 'relu', 'linear']
lr = 0.001  # varies
num_episodes = 50 
num_trajectories = 10 # varies
num_eval = 10
delta_std = 0.2 # varies
num_cuts = 10
gamma = 1.

run_name = "easy8"

if not os.path.exists(f"results/{run_name}"):
    os.mkdir(f"results/{run_name}")

all_params = {
    "units": units, "activations": activations, "lr": lr, "num_episodes": num_episodes,
    "num_trajectories": num_trajectories, "delta_std": delta_std, "num_cuts": num_cuts, "gamma": gamma
}
np.save(f"results/{run_name}/params", all_params)

In [6]:
#learning rate, num_trajectories, delta_std
preset1 = 0.1, 10, 0.30
preset2 = 0.01, 10, 0.20
preset3 = 0.001, 10, 0.15
preset4 = 0.0001, 10, 0.10

def get_params(prev_reward):
    if prev_reward <= 0.2:
        return preset1
    elif prev_reward <= 0.4:
        return preset2
    elif prev_reward <= 0.8:
        return preset3
    else:
        return preset4

## Evolutionary Strategy

In [7]:
%%wandb
# initialize policy and test
policy = Policy(units, activations, lr)
s = env.reset()
_ = policy.compute_prob(s)

optimizers = [AdamOptimizer(lr=lr) for _ in range(len(policy.get_weights()))]
rewards_record = []
prev_reward = 0.

for e in range(num_episodes):
    lr, num_trajectories, delta_std = get_params(prev_reward)
    for i in range(len(optimizers)):
        optimizers[i].lr = lr
    start_t = time.time()
    print(f"Episode {e}")
    w_orig = policy.get_weights()
    
    epsilons = []
    rewards_table = np.zeros(num_trajectories)
    print(f"Simulating {num_trajectories} trajectories...")
    
    for t in range(num_trajectories):
        eps = [np.random.randn(*x.shape)*delta_std for x in w_orig]
        w_new = [w_orig[i] + eps[i] for i in range(len(w_orig))]
        policy.set_weights(w_new)
        rewards, states, actions = rollout(env, policy, num_cuts, gamma)
        epsilons.append(eps)
        rewards_table[t] = np.sum(rewards)
    
    rewards_table_norm = (rewards_table - np.mean(rewards_table))/(np.std(rewards_table) + 1e-8)

    grads = []
    print("Estimating gradient...")
    for j in range(len(w_orig)):
        arr = np.zeros(epsilons[0][j].shape)
        for i in range(len(epsilons)):
            arr += epsilons[i][j] * rewards_table[i]
        arr /= (len(epsilons) * delta_std)
        grads.append(arr)
    
    # new_w = [w_orig[i] - lr*grads[i] for i in range(len(w_orig))]
    new_w = [optimizers[i].update(w_orig[i], grads[i]) for i in range(len(w_orig))]
    
    policy.set_weights(new_w)
    print("Evaluating rewards...")
    
    eval_r, _, _ = rollout_multiple(env, policy, num_eval, 50, gamma=1.)
    eval_r = np.array(eval_r).sum(axis=1)
    print("Evaluated rewards: %.4f" % np.mean(eval_r))
    print('mean',np.mean(eval_r),'max',np.max(eval_r),'min',np.min(eval_r),'std',np.std(eval_r))
    print("")
    rewards_record.append(np.mean(eval_r))
    
    
    fixedWindow = 10
    if len(rewards_record) >= fixedWindow:
        movingAverage = np.mean(rewards_record[len(rewards_record) - fixedWindow:len(rewards_record)])
    else:
        movingAverage = np.mean(rewards_record)
    
    wandb.log({f"Average training reward over trajectories": np.mean(eval_r), f"Training reward moving average ({fixedWindow} episodes)": movingAverage})
    np.save(f"results/{run_name}/reward{e}", eval_r)
    prev_reward = np.mean(eval_r)

Restricted license - for non-production use only - expires 2024-10-28


2022-12-15 05:39:23.916223: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2022-12-15 05:39:23.916295: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-15 05:39:23.916328: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (learn2cut): /proc/driver/nvidia/version does not exist
2022-12-15 05:39:23.917437: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the approp

Episode 0
Simulating 10 trajectories...
Estimating gradient...
Evaluating rewards...
Evaluated rewards: 0.9086
mean 0.9085830982820425 max 1.607247807321528 min 0.6151751669403893 std 0.28708528045573445

Episode 1
Simulating 10 trajectories...
Estimating gradient...
Evaluating rewards...
Evaluated rewards: 0.0400
mean 0.039971769813405444 max 0.08215683227467707 min 0.004521856069004571 std 0.025343624022779454

Episode 2
Simulating 10 trajectories...
Estimating gradient...
Evaluating rewards...
Evaluated rewards: 0.0379
mean 0.037942022016227386 max 0.08004191531017568 min 0.004521856069004571 std 0.02415962822367946

Episode 3
Simulating 10 trajectories...
Estimating gradient...
Evaluating rewards...
Evaluated rewards: 0.0397
mean 0.03966161825123891 max 0.11043128588562467 min 0.004521856069004571 std 0.02974455885355897

Episode 4
Simulating 10 trajectories...
Estimating gradient...
Evaluating rewards...


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



error in lp iteration
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3552, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_11216/434765772.py", line 51, in <module>
    eval_r, _, _ = rollout_multiple(env, policy, num_eval, 50, gamma=1.)
  File "/home/jupyter/Project_learn2cut/rollout.py", line 74, in rollout_multiple
    r, s, a = rollout(env, policy, num_cuts, gamma)
  File "/home/jupyter/Project_learn2cut/rollout.py", line 55, in rollout
    s = env.reset()
  File "/home/jupyter/Project_learn2cut/gymenv_v2.py", line 161, in reset
    return self.env_now.reset()
  File "/home/jupyter/Project_learn2cut/gymenv_v2.py", line 176, in reset
    return self.env.reset()
  File "/home/jupyter/Project_learn2cut/gymenv_v2.py", line 107, in reset
    s, d = self._reset()
  File "/home/jupyter/Project_learn2cut/gymenv_v2.py", line 103, in _reset
    self.A,self.b,self.cuts_a,self.cu

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3552, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_11216/434765772.py", line 51, in <module>
    eval_r, _, _ = rollout_multiple(env, policy, num_eval, 50, gamma=1.)
  File "/home/jupyter/Project_learn2cut/rollout.py", line 74, in rollout_multiple
    r, s, a = rollout(env, policy, num_cuts, gamma)
  File "/home/jupyter/Project_learn2cut/rollout.py", line 55, in rollout
    s = env.reset()
  File "/home/jupyter/Project_learn2cut/gymenv_v2.py", line 161, in reset
    return self.env_now.reset()
  File "/home/jupyter/Project_learn2cut/gymenv_v2.py", line 176, in reset
    return self.env.reset()
  File "/home/jupyter/Project_learn2cut/gymenv_v2.py", line 107, in reset
    s, d = self._reset()
  File "/home/jupyter/Project_learn2cut/gymenv_v2.py", line 103, in _reset
    self.A,self.b,self.cuts_a,self.cuts_b,self.done,self.ol

TypeError: object of type 'NoneType' has no len()

## Evaluation performance

In [8]:
num_envs_eval = 100

In [9]:
%%wandb
test_rewards = []
test_rewards_sum = []
movingAverage = 0
averageWindow = 10
for i in range(num_envs_eval):
    rewards, states, actions = rollout(env, policy, 50, 1.)
    test_rewards.append(rewards)
    test_rewards_sum.append(np.sum(rewards))
    if i >= averageWindow:
        movingAverage = np.mean(test_rewards_sum[i - averageWindow:i])
    
    wandb.log(
        {
            f"Test training reward": np.sum(rewards),
            f"Test moving average reward": movingAverage
        }
    )
np.save(f"results/{run_name}/eval", test_rewards)

KeyboardInterrupt: 

## Policy Gradient Algorithm

In [53]:
%%wandb
# initialize policy and test
policy = Policy(units, activations, lr)
s = env.reset()
_ = policy.compute_prob(s)

rewards_record = []
prev_reward = 0.

for e in range(num_episodes):
    lr, num_trajectories, delta_std = get_params(prev_reward)
    policy.optimizer.learning_rate = lr
    start_t = time.time()
    print(f"Episode {e}")
    
    epsilons = []
    rewards_table = []
    states_train = []
    actions_train = []
    print(f"Simulating {num_trajectories} trajectories...")
    
    for t in range(num_trajectories):
        rewards, states, actions = rollout(env, policy, num_cuts, gamma)
        rewards_table.append(np.flip(np.flip(rewards).cumsum()))
        states_train.append([[aux2.astype(float) for aux2 in aux1] for aux1 in states])
        actions_train.append(actions)
    
    print("Estimating gradient...")
    for s_, r_, a_ in zip(states_train, rewards_table, actions_train):
        _, _ = policy.train(s_, r_, a_)
        
    print("Evaluating rewards...")
    
    eval_r, _, _ = rollout_multiple(env, policy, num_eval, 50, gamma=1.)
    eval_r = np.array(eval_r).sum(axis=1)
    print("Evaluated rewards: %.4f" % np.mean(eval_r))
    print('mean',np.mean(eval_r),'max',np.max(eval_r),'min',np.min(eval_r),'std',np.std(eval_r))
    print("")
    rewards_record.append(np.mean(eval_r))
    
    
    fixedWindow = 10
    if len(rewards_record) >= fixedWindow:
        movingAverage = np.mean(rewards_record[len(rewards_record) - fixedWindow:len(rewards_record)])
    else:
        movingAverage = np.mean(rewards_record)
    
    wandb.log({f"Average training reward over trajectories": np.mean(eval_r), f"Training reward moving average ({fixedWindow} episodes)": movingAverage})
    np.save(f"results/{run_name}/reward{e}", eval_r)
    prev_reward = np.mean(eval_r)

Episode 0
Simulating 10 trajectories...
Evaluating rewards...
Evaluated rewards: 0.8700
mean 0.8699557146871257 max 1.1153490818669525 min 0.6072478073228922 std 0.21352661094946299

Episode 1
Simulating 10 trajectories...
Evaluating rewards...
Evaluated rewards: 0.9247
mean 0.9246550770747944 max 1.1153490818669525 min 0.6072478073228922 std 0.205001931851864

Episode 2
Simulating 10 trajectories...
Evaluating rewards...
Evaluated rewards: 0.8560
mean 0.8560136951227151 max 1.1153490818669525 min 0.6151751669449368 std 0.18506951704997698

Episode 3
Simulating 10 trajectories...
Evaluating rewards...
Evaluated rewards: 0.8258
mean 0.8258302526727448 max 1.1153490818669525 min 0.6072478073228922 std 0.1792097475877898

Episode 4
Simulating 10 trajectories...
Evaluating rewards...
Evaluated rewards: 0.8878
mean 0.8877926989853904 max 1.1153490818669525 min 0.6072478073228922 std 0.20349934508706363

Episode 5
Simulating 10 trajectories...
error in lp iteration


KeyboardInterrupt: 

In [62]:
t = time.time()

In [63]:
t1 = time.time()

In [59]:
(t1 - t)/60

0.10902396440505982

In [65]:
print("Time elapsed: %.4f minutes" % ((t1 - t)/60))

Time elapsed: 0.0231 minutes


In [164]:
weights = []

In [165]:
# initialize policy and test
policy = Policy(units, activations, lr=0.01)
s = env.reset()
_ = policy.compute_prob(s)
rewards_record = []

for e in range(num_episodes):
    weights.append(policy.get_weights())
    rewards, states, actions = [], [], []
    for t in range(num_trajectories):
        r_, s_, a_ = rollout(env, policy, num_cuts, gamma)
        rewards.append(r_)
        states.append(s_)
        actions.append(a_)
    
    for state, reward, action in zip(states, rewards, actions):
        loss, gs = policy.train(state, reward, action)
    
    # evaluate rewards
    eval_r, _, _ = rollout(env, policy, num_cuts, gamma)

    print(f"Episode {e}:")
    print('mean',np.mean(eval_r),'max',np.max(eval_r),'min',np.min(eval_r),'std',np.std(eval_r))
    print("")
    rewards_record.append(np.mean(eval_r))
    
#     fixedWindow = 100
#     movingAverage = 0
#     if len(rewards_record) >= fixedWindow:
#         movingAverage = np.mean(rewards_record[len(rewards_record)-fixedWindow:len(rewards_record)-1])
        
    # wandb.log({"Training reward" : float(rewards_record[-1]), "Training reward moving average": movingAverage})
    # np.save(f"results/{run_name}/reward{e}", eval_r)

Episode 0:
mean 0.03022674249687125 max 0.04331369729237188 min 0.015532067551566798 std 0.01081458051292926

Episode 1:
mean 0.009053976514660526 max 0.017542173560850367 min 1.7402659977960866e-05 std 0.0071892081363139244

Episode 2:
mean 0.008553477670708624 max 0.02582539067602243 min 0.0006530065411425312 std 0.008436587447415399

Episode 3:
mean 0.02555015917074782 max 0.04136805025977598 min 0.010084274502332846 std 0.011244516942115647

error in lp iteration
Episode 4:
mean 1.4543466022587381e-05 max 2.9086932045174763e-05 min 0.0 std 1.4543466022587381e-05

error in lp iteration


KeyboardInterrupt: 

In [161]:
weights[1][3]

<tf.Variable 'attention_embedding_9/Variable:0' shape=(64,) dtype=float32, numpy=
array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.01591764,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.     

In [162]:
weights[2][3]

<tf.Variable 'attention_embedding_9/Variable:0' shape=(64,) dtype=float32, numpy=
array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.01591764,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.     

In [134]:
rewards, states, actions = rollout(env.envs[0], policy=policy, num_cuts=5, gamma=0.9)

In [135]:
d_rewards = discounted_rewards(rewards, 0.9)

In [136]:
loss, g = policy.train(states, d_rewards, actions)

In [125]:
state, reward, action = states[0], d_rewards[0], actions[0]

In [126]:
prob = tf.cast(tf.nn.softmax(policy.attention(state), axis=-1), tf.double)

In [128]:
action_onehot = tf.cast(tf.one_hot(action, len(prob)), tf.double)

In [132]:
-tf.reduce_mean(reward*tf.reduce_sum(prob * action_onehot, axis=-1))

<tf.Tensor: shape=(), dtype=float64, numpy=-0.0013493157667902656>

In [99]:
wnew = policy.get_weights()

In [101]:
worig[0]

<tf.Variable 'attention_embedding_5/Variable:0' shape=(61, 64) dtype=float32, numpy=
array([[ 0.02765853,  0.02115008, -0.04661967, ..., -0.00032009,
         0.06772545, -0.15605754],
       [ 0.12472709,  0.03117213, -0.0408311 , ...,  0.09616747,
        -0.04850346, -0.01543978],
       [ 0.11055695,  0.04528916, -0.10836247, ..., -0.07388528,
         0.02407323,  0.04287877],
       ...,
       [ 0.01313807,  0.06044581,  0.01039451, ..., -0.02181788,
         0.03599485, -0.0163053 ],
       [-0.02584703,  0.0009844 ,  0.05448824, ..., -0.03947347,
         0.03312697, -0.06217254],
       [-0.06193162,  0.04476308,  0.06298922, ...,  0.06827538,
        -0.03702519,  0.0773042 ]], dtype=float32)>

In [103]:
g[0]

[<tf.Tensor: shape=(61, 64), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(61, 64), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(64,), dtype=float32, numpy=
 array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  3.4412409e-08, -6.1519186e-11,  0.0000000e+00,
         4.7996324e-10,  0.0000000e+00,  0.0000000e+00,  7.6437134e-10,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00, -3.2961063e-05,
         0.0000000e+00,  0.00000

In [79]:
prob = policy.compute_prob(states[0])

In [82]:
prob = tf.cast(prob, tf.double)


In [85]:
action_onehot = tf.cast(tf.one_hot(actions[0], len(prob)), tf.double)

In [86]:
action_onehot

<tf.Tensor: shape=(60,), dtype=float64, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])>

In [89]:
prob_selected = tf.reduce_sum(prob * action_onehot, axis=-1)

In [91]:
prob_selected += 1e-8

In [94]:
import tensorflow_probability as tfp

In [97]:
dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)

In [98]:
dist.log_prob(actions[0])

<tf.Tensor: shape=(), dtype=float64, numpy=-4.0943445622221>

In [49]:
tf.convert_to_tensor(rewards)

<tf.Tensor: shape=(10, 6), dtype=float64, numpy=
array([[1.13376639e-03, 3.08404304e-02, 3.76251281e-02, 4.88329207e-02,
        5.19279231e-02, 5.60047287e-02],
       [1.35804102e-03, 1.33847005e-02, 1.52768998e-02, 2.73561120e-02,
        3.28219100e-02, 3.48714200e-02],
       [2.89780385e-04, 2.27100842e-03, 7.88102197e-03, 8.04201753e-03,
        8.69801831e-03, 9.77143384e-03],
       [1.13673226e-02, 3.98705513e-02, 6.95828604e-02, 7.81896013e-02,
        8.58904937e-02, 9.09576944e-02],
       [2.13645849e-02, 3.41761543e-02, 5.56414986e-02, 5.59696274e-02,
        5.63511518e-02, 5.64184296e-02],
       [5.00194310e-04, 1.05171533e-03, 4.28482347e-02, 5.16368156e-01,
        5.16368156e-01, 5.16368156e-01],
       [8.82255726e-02, 9.50893077e-02, 1.04324512e-01, 1.04337391e-01,
        1.12804798e-01, 1.15944789e-01],
       [9.45784145e-03, 1.23628241e-02, 2.51291122e-02, 2.62451570e-02,
        2.72499772e-02, 2.74800479e-02],
       [9.77637083e-05, 1.24941829e-04, 1.21762

In [23]:
# initialize policy and test
policy = Policy(units, activations)
s = env.reset()
_ = policy.compute_prob(s)
rewards_record = []

for e in range(num_episodes):
    s = env.reset()
    w_orig_cons, w_orig_cuts = policy.get_weights()
    for t in range(num_trajectories):
        epsilon_cons = [np.random.randn(*x.shape)*delta_std for x in w_orig_cons]
        epsilon_cuts = [np.random.randn(*x.shape)*delta_std for x in w_orig_cuts]
        w_new_cons = [w_orig_cons[i] + epsilon_cons[i] for i in range(len(w_orig_cons))]
        w_new_cuts = [w_orig_cuts[i] + epsilon_cuts[i] for i in range(len(w_orig_cuts))]
        policy.set_weights(w_new_cons, w_new_cuts)
        rewards, times = rollout_env(env=env.envs, policy=policy, num_rollouts=1, rollout_length=rollout_length, gamma=gamma)
        epsilon_table_cons.append(epsilon_cons)
        epsilon_table_cuts.append(epsilon_cuts)
        # epsilon_table.append(epsilon)
        train_rewards_table.append(np.mean(rewards))
    
    train_rewards_table = np.array(train_rewards_table)
    train_rewards_table = (train_rewards_table - np.mean(train_rewards_table))/ (np.std(train_rewards_table) + 1e-8)

    grads_cons = []
    grads_cuts = []
    for j in range(len(w_orig_cons)):
        arr_cons = np.zeros(epsilon_table_cons[0][j].shape)
        arr_cuts = np.zeros(epsilon_table_cuts[0][j].shape)
        for i in range(len(epsilon_table_cons)):
            arr_cons += epsilon_table_cons[i][j] * train_rewards_table[i]
            arr_cuts += epsilon_table_cuts[i][j] * train_rewards_table[i]
        arr_cons /= (len(epsilon_table_cons) * delta_std)
        arr_cuts /= (len(epsilon_table_cuts) * delta_std)
        grads_cons.append(arr_cons)
        grads_cuts.append(arr_cuts)
    
    # assign back original weights and update
    w_cons = [w_orig_cons[i] - lr*grads_cons[i] for i in range(len(w_orig_cons))]
    w_cuts = [w_orig_cuts[i] - lr*grads_cuts[i] for i in range(len(w_orig_cuts))]

    policy.set_weights(w_cons, w_cuts)
    
    # evaluate rewards
    eval_r, _ = rollout_envs(envs=env.envs, policy=policy, num_rollouts=1, rollout_length=time_limit, gamma=gamma)
    print(f"Episode {e}:")
    print('mean',np.mean(eval_r),'max',np.max(eval_r),'min',np.min(eval_r),'std',np.std(eval_r))
    print("")
    rewards_record.append(np.mean(eval_r))
    
    fixedWindow = 100
    movingAverage = 0
    if len(rewards_record) >= fixedWindow:
        movingAverage = np.mean(rewards_record[len(rewards_record)-fixedWindow:len(rewards_record)-1])
        
    wandb.log({"Training reward" : float(rewards_record[-1]), "Training reward moving average": movingAverage})
    np.save(f"results/{run_name}/reward{e}", eval_r)

0.023704249999999996

In [24]:
(1-(0.6**2))/200

0.0032